from pandas import read_csv
import pandas as pd
from pandas.plotting import lag_plot
import seaborn as sn
import matplotlib.pyplot as plt
import numpy as np
import os
import zipfile
import io
from sklearn.preprocessing import MinMaxScaler
import matplotlib.cm as cm
Lectura del conjunto de datos obtenido de la pagina de salud de Murcia https://www.murciasalud.es/pagina.php?id=458869 Recogidos desde el 08/03/2020 hasta el dia 25/03/2020
# Load the regional daily-cases CSV into a DataFrame.
lectura = read_csv('../Datos_originales/casos_diarios_total_regional.csv', skiprows = 4, encoding='latin-1', sep = ';', na_values = ['NaN', 'NaT'])
# header: not passed, so the first row read (after the skipped ones) supplies the column names
# skiprows = 4: the first four rows of the file are free text, so they are skipped
# encoding = 'latin-1': avoids decoding errors on some characters
# sep = ';': delimiter that separates the columns
# NOTE(review): the original notes also described index_col = 0 (dates as the row index), but that argument is not passed in the call above
Renombramos los nombres de algunas de las columnas para que queden mas compactos
# Short names for the source columns. The twelve age/sex/symptom breakdown
# columns follow a regular pattern, so their mapping is generated.
columnas_cortas = {
    'Pruebas_PCR_Ag': 'Pruebas',
    'Casos_incidentes_PCR/Ag+': 'Incidentes',
    'Hospitalizados': 'H',
    'Hospitaliz_UCI': 'H_UCI',
    **{
        f'{cuadro_largo}_{sexo_largo}_{edad}años': f'{cuadro_corto}_{sexo_corto}_{edad}'
        for sexo_largo, sexo_corto in (('hombre', 'H'), ('mujer', 'M'))
        for cuadro_largo, cuadro_corto in (('Asintomatico', 'Asint'), ('Sintomatico', 'Sint'))
        for edad in ('<45', '45a64', '+65')
    },
}
datos = lectura.rename(columns=columnas_cortas)
Convertimos los datos leidos en un dataFrame y lo mostramos para ver su apariencia
# Wrap the renamed data in a DataFrame and preview its first three rows.
df_salud_murcia = pd.DataFrame(datos)
df_salud_murcia.head(3)
| Fecha | Pruebas | Incidentes | H | H_UCI | Defunciones | Asint_H_<45 | Asint_H_45a64 | Asint_H_+65 | Sint_H_<45 | Sint_H_45a64 | Sint_H_+65 | Asint_M_<45 | Asint_M_45a64 | Asint_M_+65 | Sint_M_<45 | Sint_M_45a64 | Sint_M_+65 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 08/03/2020 | 130.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 1 | 09/03/2020 | 21.0 | 3.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 | 0.0 | 0.0 |
| 2 | 10/03/2020 | 39.0 | 6.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 0.0 |
Lo siguiente que haremos es indexar el dataframe por la columna 'Fecha'. Para ello, primero debemos convertirla en un objeto de tipo DatetimeIndex. Usamos la función 'to_datetime' y le indicamos el formato en el que se encuentran las fechas para que se lean y procesen bien. Posteriormente eliminamos la columna del dataframe e insertamos el nuevo DatetimeIndex como índice del dataframe. Finalmente mostramos el resultado.
# Parse the 'Fecha' column (day/month/year strings) and use it as the index.
# The format string has a descriptive name so that the builtin `format` is
# not shadowed for the rest of the session.
formato_fecha = '%d/%m/%Y'
datetime_fecha = pd.to_datetime(df_salud_murcia['Fecha'], format=formato_fecha)
# assign() returns a new frame with 'Fecha' replaced by the parsed dates, so
# the separate drop/insert steps are not needed before set_index().
df_salud_murcia = df_salud_murcia.assign(Fecha=datetime_fecha).set_index('Fecha')
df_salud_murcia.head(3)
| Pruebas | Incidentes | H | H_UCI | Defunciones | Asint_H_<45 | Asint_H_45a64 | Asint_H_+65 | Sint_H_<45 | Sint_H_45a64 | Sint_H_+65 | Asint_M_<45 | Asint_M_45a64 | Asint_M_+65 | Sint_M_<45 | Sint_M_45a64 | Sint_M_+65 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Fecha | |||||||||||||||||
| 2020-03-08 | 130.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 2020-03-09 | 21.0 | 3.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 | 0.0 | 0.0 |
| 2020-03-10 | 39.0 | 6.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 0.0 |
Con la siguiente función eliminamos todos aquellos valores que se almacenan o completan como NaN.
# Remove, in place, every row that contains a missing value.
df_salud_murcia.dropna(axis = 0, inplace = True)
Sumamos asintomáticos y sintomáticos. Eliminamos los grupos y nos quedamos con el total de cada uno
# Age/sex breakdown columns, grouped by clinical presentation.
columnas_asintomaticos = ['Asint_H_<45', 'Asint_H_45a64', 'Asint_H_+65',
                          'Asint_M_<45', 'Asint_M_45a64', 'Asint_M_+65']
columnas_sintomaticos = ['Sint_H_<45', 'Sint_H_45a64', 'Sint_H_+65',
                         'Sint_M_<45', 'Sint_M_45a64', 'Sint_M_+65']
# Row-wise totals; skipna=False makes a NaN in any group propagate to the
# total, exactly as chaining `+` over the columns does.
df_salud_murcia['Asintomaticos'] = df_salud_murcia[columnas_asintomaticos].sum(axis=1, skipna=False)
df_salud_murcia['Sintomaticos'] = df_salud_murcia[columnas_sintomaticos].sum(axis=1, skipna=False)
# Keep only the two totals and discard the per-group columns.
df_salud_murcia = df_salud_murcia.drop(columnas_asintomaticos + columnas_sintomaticos, axis=1)
A continuación lo que hacemos es guardarla en un archivo de extensión .csv
# Save the preprocessed health data (date index included) as CSV.
df_salud_murcia.to_csv(path_or_buf = '../Datos_preprocesados/salud_murcia.csv')
Al igual que hicimos con el dataframe de los datos de salud que leímos al principio, debemos transformar las fechas en objetos de tipo DatetimeIndex. Posteriormente las ordenaremos para que se muestren en orden cronológico. Para ello usamos la función 'to_datetime', que convierte nuestra lista de fechas en un DatetimeIndex. Acto seguido creamos el dataframe con las entradas, las salidas de Murcia y las fechas, y ponemos estas últimas como índice del dataframe. Ahora solo queda ordenarlo. Añadimos tres columnas adicionales al dataframe, que son los meses, días y años de las fechas. Ordenamos según estos valores y posteriormente las eliminamos del dataframe. Mostramos el resultado.
def conversor_a_dataframe(lista_fechas, lista_entradas, lista_salidas):
    """Build a chronologically sorted entries/exits DataFrame.

    Args:
        lista_fechas: date strings in 'YYYY-DD-MM' form (year, day, month).
        lista_entradas: inbound trip totals, one per date.
        lista_salidas: outbound trip totals, one per date.

    Returns:
        DataFrame with columns 'Entradas' and 'Salidas', indexed by a
        DatetimeIndex named 'Fecha' and sorted from oldest to newest.
    """
    # Named so that the builtin `format` is not shadowed.
    formato_fecha = '%Y-%d-%m'
    fechas = pd.to_datetime(lista_fechas, format=formato_fecha)
    df_entradas_salidas = pd.DataFrame(
        list(zip(lista_entradas, lista_salidas)),
        columns=['Entradas', 'Salidas'],
        index=fechas,
    ).rename_axis('Fecha', axis=0)
    # The parsed dates carry no time of day, so sorting the index directly is
    # equivalent to sorting by (year, month, day); a stable sort keeps rows
    # with identical dates in their input order.
    return df_entradas_salidas.sort_index(kind='mergesort')
Necesitamos añadir los datos de movilidad proporcionados por el Ministerio de Transporte y Movilidad en la Región de Murcia. Para ello se buscaron los códigos de los municipios murcianos así como sus correspondientes códigos para el Ministerio, ya que no siempre coinciden.
La siguiente función permite parsear las fechas, a partir del nombre del fichero obtenemos la fecha correspondiente
def parseador_fechas_mitma(nombre):
    """Extract the date encoded at the start of a file name.

    The first eight characters of ``nombre`` are read as YYYYMMDD and
    returned as the string 'YYYY-DD-MM' (year, day, month).
    """
    anio, mes, dia = nombre[:4], nombre[4:6], nombre[6:8]
    return f'{anio}-{dia}-{mes}'
La lista "codigos" contiene todos los códigos de los municipios de la Región con la codificación usada por el Ministerio
# Codes that do not follow the regular 300NN pattern
codigos = ['30901', '30902']
# Municipality numbers that map to a non-standard code
codigos_especiales = {
    4: '30014_AM',
    6: '30039_AM',
    14: '30014_AM',
    27: '3002701',
    29: '30029_AM',
    31: '30042_AM',
    32: '30029_AM',
    34: '30014_AM',
    39: '30039_AM',
    40: '30042_AM',
    42: '30042_AM',
}
for i in range(1, 44):
    # Every other number becomes '300' plus the number, zero-padded to two
    # digits (e.g. 7 -> '30007', 12 -> '30012').
    codigos.append(codigos_especiales.get(i, f'30{i:03d}'))
Lo siguiente que haremos es procesar todos los datos que tenemos. Para cada archivo, nos quedamos con las salidas y entradas en la Región de Murcia; esto es, no se considera la movilidad entre distintos municipios murcianos, sino las entradas a la Región desde municipios de fuera de ella y las salidas desde municipios de la Región hacia municipios de fuera de ella.
# For every daily mobility file, add up the trips that enter and the trips
# that leave the Region (trips with both ends inside it are excluded).
ruta = '../Datos_originales/mitma_movilidad/'
lista_fechas = []
lista_entradas = []
lista_salidas = []
# Each entry under `ruta` is listed as a folder of daily files
archivos = os.listdir(ruta)
for a in archivos:
    carpeta = os.path.join(ruta, a)
    # Skip stray non-directory entries instead of failing in os.listdir()
    if not os.path.isdir(carpeta):
        continue
    meses = os.listdir(carpeta)
    for m in meses:
        # Ignore "._*" companion files
        if "._" in m:
            continue
        # The date is encoded in the file name
        fecha = parseador_fechas_mitma(m)
        lista_fechas.append(fecha)
        # Read the gzip-compressed, pipe-separated daily file
        df = pd.read_csv(os.path.join(carpeta, m), compression='gzip', sep='|', header=0, low_memory=False)
        # Compute each membership mask once and combine them
        origen_en_region = df['origen'].isin(codigos)
        destino_en_region = df['destino'].isin(codigos)
        # Exits: origin inside the Region, destination outside
        df_s = df[origen_en_region & ~destino_en_region]
        # Entries: destination inside the Region, origin outside
        df_e = df[destino_en_region & ~origen_en_region]
        # Total trips in each direction for this day
        salidas = df_s['viajes'].sum()
        entradas = df_e['viajes'].sum()
        lista_salidas.append(salidas)
        lista_entradas.append(entradas)
Convertimos los datos recogidos en un dataframe mediante la función definida anteriormente
# Turn the collected dates and trip totals into a date-indexed DataFrame and
# show its last three rows.
df_mitma = conversor_a_dataframe(lista_fechas, lista_entradas, lista_salidas)
df_mitma.tail(3)
| Entradas | Salidas | |
|---|---|---|
| Fecha | ||
| 2021-02-26 | 130495.728 | 126471.246 |
| 2021-02-27 | 73285.851 | 71385.928 |
| 2021-02-28 | 49817.997 | 49735.536 |
Vemos ahora el número de filas y el número de missing values
df_mitma.shape
(374, 2)
df_mitma.isna().sum()
Entradas 0 Salidas 0 dtype: int64
Guardamos el conjunto de datos con la movilidad en un nuevo archivo csv
# Save the mobility data (date index included) as CSV.
df_mitma.to_csv(path_or_buf = '../Datos_preprocesados/movilidad.csv')
En este apartado se juntan los dataframe de las dos fuentes de datos que se han procesado. De esta forma, a los datos de salud se le añaden los datos de movilidad
# Place health and mobility columns side by side, aligned on the date index.
# Dates present in only one of the two sources get NaN in the other's columns.
df_salud_movilidad = pd.concat([df_salud_murcia, df_mitma], axis=1)
df_salud_movilidad.head(3)
| Pruebas | Incidentes | H | H_UCI | Defunciones | Asintomaticos | Sintomaticos | Entradas | Salidas | |
|---|---|---|---|---|---|---|---|---|---|
| Fecha | |||||||||
| 2020-02-21 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 166795.814 | 168567.936 |
| 2020-02-22 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 144424.897 | 144077.722 |
| 2020-02-23 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 138142.960 | 133974.435 |
# Keep only the dates that have both health and mobility data, save the
# result as CSV, then reload it from disk and restore 'Fecha' as the index.
df_salud_movilidad.dropna(axis = 0, inplace = True)
df_salud_movilidad.to_csv(path_or_buf = '../Datos_preprocesados/salud_movilidad.csv')
# NOTE(review): no parse_dates is passed, so after this reload 'Fecha' is
# presumably an index of plain strings rather than datetimes - confirm that
# the downstream plots rely on that before changing it.
df_salud_movilidad = read_csv('../Datos_preprocesados/salud_movilidad.csv', encoding='latin-1', sep = ',', na_values = ['NaN', 'NaT'])
df_salud_movilidad = df_salud_movilidad.set_index('Fecha')
df_salud_movilidad.head(3)
| Pruebas | Incidentes | H | H_UCI | Defunciones | Asintomaticos | Sintomaticos | Entradas | Salidas | |
|---|---|---|---|---|---|---|---|---|---|
| Fecha | |||||||||
| 2020-03-08 | 130.0 | 1.0 | 0.0 | 0.0 | 0.0 | 2.0 | 1.0 | 134872.118 | 129792.956 |
| 2020-03-09 | 21.0 | 3.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4.0 | 146586.006 | 148231.629 |
| 2020-03-10 | 39.0 | 6.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4.0 | 145999.438 | 145006.937 |
Contamos de nuevo las filas y el número de missing values y los eliminamos
df_salud_movilidad.shape
(358, 9)
df_salud_movilidad.isna().sum()
Pruebas 0 Incidentes 0 H 0 H_UCI 0 Defunciones 0 Asintomaticos 0 Sintomaticos 0 Entradas 0 Salidas 0 dtype: int64
# One small panel per column of the merged dataset, laid out on a 7x3 grid.
num = 0
# NOTE(review): recent Matplotlib releases renamed this style to
# 'seaborn-v0_8-white' - confirm against the installed version.
plt.style.use('seaborn-white')
palette = plt.get_cmap('prism')
columnas_movilidad = ("Salidas", "Entradas")
for column in df_salud_movilidad:
    num += 1
    color = palette(num)
    # Select the grid cell for this column
    plt.subplot(7, 3, num)
    valores = df_salud_movilidad[column]
    if any(clave in column for clave in columnas_movilidad):
        # Mobility columns are drawn as individual points
        valores.plot(style='.', color=color, linewidth=0.8, alpha=0.9, label=column)
    else:
        # Every other column is drawn as a line
        plt.plot(valores, marker='', color=color, linewidth=0.8, alpha=0.9, label=column)
    plt.title(column, loc='center', fontsize=12, fontweight=0, color=color)
    # Hide the x tick labels of each panel
    plt.xticks([])
plt.subplots_adjust(left=0.01, right=4, top=5, bottom=0.01, hspace=0.2, wspace=0.2)
plt.show()
La siguiente función crea un predictor con el lag que se le pase como parámetro sobre el conjunto de datos
def crear_predictor(lag, df=None):
    """Build a lagged-feature table for predicting 'Incidentes'.

    Args:
        lag: number of past rows to expose per column; for every column
            ``c`` the result gets ``'c t-1'`` ... ``'c t-<lag>'``.
        df: source DataFrame. Defaults to the module-level
            ``df_salud_movilidad`` when omitted.

    Returns:
        A new DataFrame whose first column is the current 'Incidentes' value,
        followed by the lagged copies of every source column (including
        'Incidentes' itself). The first ``lag`` rows contain NaN in the lagged
        columns; callers drop them afterwards.

    Raises:
        KeyError: if the source frame has no 'Incidentes' column.
    """
    origen = df_salud_movilidad if df is None else df
    # shift(k) moves values k rows down, so row t holds the value from row t-k
    retardos = [
        origen[columna].shift(k).rename(f'{columna} t-{k}')
        for columna in origen.columns
        for k in range(1, lag + 1)
    ]
    # Only the target keeps its unshifted values; the other original columns
    # are represented solely by their lags.
    return pd.concat([origen[['Incidentes']]] + retardos, axis=1)
Creamos dos predictores, uno con lag3 y otro con lag5
# Build the 3-lag predictor table and drop the leading rows whose lagged
# values are missing.
df_predictor_lag3 = crear_predictor(3)
df_predictor_lag3.dropna(axis = 0, inplace = True)
df_predictor_lag3.head(3)
| Incidentes | Pruebas t-1 | Pruebas t-2 | Pruebas t-3 | Incidentes t-1 | Incidentes t-2 | Incidentes t-3 | H t-1 | H t-2 | H t-3 | ... | Asintomaticos t-3 | Sintomaticos t-1 | Sintomaticos t-2 | Sintomaticos t-3 | Entradas t-1 | Entradas t-2 | Entradas t-3 | Salidas t-1 | Salidas t-2 | Salidas t-3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Fecha | |||||||||||||||||||||
| 2020-03-11 | 12.0 | 39.0 | 21.0 | 130.0 | 6.0 | 3.0 | 1.0 | 0.0 | 0.0 | 0.0 | ... | 2.0 | 4.0 | 4.0 | 1.0 | 145999.438 | 146586.006 | 134872.118 | 145006.937 | 148231.629 | 129792.956 |
| 2020-03-12 | 9.0 | 116.0 | 39.0 | 21.0 | 12.0 | 6.0 | 3.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 10.0 | 4.0 | 4.0 | 145431.106 | 145999.438 | 146586.006 | 143921.173 | 145006.937 | 148231.629 |
| 2020-03-13 | 12.0 | 136.0 | 116.0 | 39.0 | 9.0 | 12.0 | 6.0 | 4.0 | 0.0 | 0.0 | ... | 0.0 | 7.0 | 10.0 | 4.0 | 145470.771 | 145431.106 | 145999.438 | 144238.438 | 143921.173 | 145006.937 |
3 rows × 28 columns
# Save the 3-lag predictor table as CSV, then reload it from disk and restore
# 'Fecha' as the index.
df_predictor_lag3.to_csv(path_or_buf = '../Datos_preprocesados/predictor_lag3.csv')
df_predictor_lag3 = read_csv('../Datos_preprocesados/predictor_lag3.csv', encoding='latin-1', sep = ',', na_values = ['NaN', 'NaT'])
df_predictor_lag3 = df_predictor_lag3.set_index('Fecha')
df_predictor_lag3.head(3)
| Incidentes | Pruebas t-1 | Pruebas t-2 | Pruebas t-3 | Incidentes t-1 | Incidentes t-2 | Incidentes t-3 | H t-1 | H t-2 | H t-3 | ... | Asintomaticos t-3 | Sintomaticos t-1 | Sintomaticos t-2 | Sintomaticos t-3 | Entradas t-1 | Entradas t-2 | Entradas t-3 | Salidas t-1 | Salidas t-2 | Salidas t-3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Fecha | |||||||||||||||||||||
| 2020-03-11 | 12.0 | 39.0 | 21.0 | 130.0 | 6.0 | 3.0 | 1.0 | 0.0 | 0.0 | 0.0 | ... | 2.0 | 4.0 | 4.0 | 1.0 | 145999.438 | 146586.006 | 134872.118 | 145006.937 | 148231.629 | 129792.956 |
| 2020-03-12 | 9.0 | 116.0 | 39.0 | 21.0 | 12.0 | 6.0 | 3.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 10.0 | 4.0 | 4.0 | 145431.106 | 145999.438 | 146586.006 | 143921.173 | 145006.937 | 148231.629 |
| 2020-03-13 | 12.0 | 136.0 | 116.0 | 39.0 | 9.0 | 12.0 | 6.0 | 4.0 | 0.0 | 0.0 | ... | 0.0 | 7.0 | 10.0 | 4.0 | 145470.771 | 145431.106 | 145999.438 | 144238.438 | 143921.173 | 145006.937 |
3 rows × 28 columns
# Build the 5-lag predictor table and drop the leading rows whose lagged
# values are missing.
df_predictor_lag5 = crear_predictor(5)
df_predictor_lag5.dropna(axis = 0, inplace = True)
df_predictor_lag5.head(3)
| Incidentes | Pruebas t-1 | Pruebas t-2 | Pruebas t-3 | Pruebas t-4 | Pruebas t-5 | Incidentes t-1 | Incidentes t-2 | Incidentes t-3 | Incidentes t-4 | ... | Entradas t-1 | Entradas t-2 | Entradas t-3 | Entradas t-4 | Entradas t-5 | Salidas t-1 | Salidas t-2 | Salidas t-3 | Salidas t-4 | Salidas t-5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Fecha | |||||||||||||||||||||
| 2020-03-13 | 12.0 | 136.0 | 116.0 | 39.0 | 21.0 | 130.0 | 9.0 | 12.0 | 6.0 | 3.0 | ... | 145470.771 | 145431.106 | 145999.438 | 146586.006 | 134872.118 | 144238.438 | 143921.173 | 145006.937 | 148231.629 | 129792.956 |
| 2020-03-14 | 23.0 | 228.0 | 136.0 | 116.0 | 39.0 | 21.0 | 12.0 | 9.0 | 12.0 | 6.0 | ... | 143647.028 | 145470.771 | 145431.106 | 145999.438 | 146586.006 | 140719.734 | 144238.438 | 143921.173 | 145006.937 | 148231.629 |
| 2020-03-15 | 18.0 | 196.0 | 228.0 | 136.0 | 116.0 | 39.0 | 23.0 | 12.0 | 9.0 | 12.0 | ... | 71089.567 | 143647.028 | 145470.771 | 145431.106 | 145999.438 | 69784.739 | 140719.734 | 144238.438 | 143921.173 | 145006.937 |
3 rows × 46 columns
# Save the 5-lag predictor table as CSV, then reload it from disk and restore
# 'Fecha' as the index.
df_predictor_lag5.to_csv(path_or_buf = '../Datos_preprocesados/predictor_lag5.csv')
df_predictor_lag5 = read_csv('../Datos_preprocesados/predictor_lag5.csv', encoding='latin-1', sep = ',', na_values = ['NaN', 'NaT'])
df_predictor_lag5 = df_predictor_lag5.set_index('Fecha')
df_predictor_lag5.head(3)
| Incidentes | Pruebas t-1 | Pruebas t-2 | Pruebas t-3 | Pruebas t-4 | Pruebas t-5 | Incidentes t-1 | Incidentes t-2 | Incidentes t-3 | Incidentes t-4 | ... | Entradas t-1 | Entradas t-2 | Entradas t-3 | Entradas t-4 | Entradas t-5 | Salidas t-1 | Salidas t-2 | Salidas t-3 | Salidas t-4 | Salidas t-5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Fecha | |||||||||||||||||||||
| 2020-03-13 | 12.0 | 136.0 | 116.0 | 39.0 | 21.0 | 130.0 | 9.0 | 12.0 | 6.0 | 3.0 | ... | 145470.771 | 145431.106 | 145999.438 | 146586.006 | 134872.118 | 144238.438 | 143921.173 | 145006.937 | 148231.629 | 129792.956 |
| 2020-03-14 | 23.0 | 228.0 | 136.0 | 116.0 | 39.0 | 21.0 | 12.0 | 9.0 | 12.0 | 6.0 | ... | 143647.028 | 145470.771 | 145431.106 | 145999.438 | 146586.006 | 140719.734 | 144238.438 | 143921.173 | 145006.937 | 148231.629 |
| 2020-03-15 | 18.0 | 196.0 | 228.0 | 136.0 | 116.0 | 39.0 | 23.0 | 12.0 | 9.0 | 12.0 | ... | 71089.567 | 143647.028 | 145470.771 | 145431.106 | 145999.438 | 69784.739 | 140719.734 | 144238.438 | 143921.173 | 145006.937 |
3 rows × 46 columns
Utilizamos la clase MinMaxScaler para normalizar el conjunto de datos entre 0 y 1
def escalar_datos(df):
    """Return a min-max scaled copy of ``df``.

    A fresh MinMaxScaler is fitted on every column of ``df`` and the result
    is returned as a DataFrame with the same column names and index.

    NOTE(review): the fitted scaler is local and discarded on return, so
    callers cannot invert the transform later - confirm this is intended.
    """
    valores_escalados = MinMaxScaler().fit_transform(df)
    return pd.DataFrame(valores_escalados, columns=df.columns, index=df.index)
Escalamos los predictores de lag3 y lag5 calculados anteriormente
# Scale every column of the 3-lag predictor table (target included).
df_predictor_lag3_escalado = escalar_datos(df_predictor_lag3)
df_predictor_lag3_escalado.head(3)
| Incidentes | Pruebas t-1 | Pruebas t-2 | Pruebas t-3 | Incidentes t-1 | Incidentes t-2 | Incidentes t-3 | H t-1 | H t-2 | H t-3 | ... | Asintomaticos t-3 | Sintomaticos t-1 | Sintomaticos t-2 | Sintomaticos t-3 | Entradas t-1 | Entradas t-2 | Entradas t-3 | Salidas t-1 | Salidas t-2 | Salidas t-3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Fecha | |||||||||||||||||||||
| 2020-03-11 | 0.005256 | 0.003270 | 0.001761 | 0.010901 | 0.002628 | 0.001314 | 0.000438 | 0.000000 | 0.0 | 0.0 | ... | 0.00354 | 0.002415 | 0.002415 | 0.000604 | 0.371644 | 0.373381 | 0.338683 | 0.335522 | 0.344440 | 0.293448 |
| 2020-03-12 | 0.003942 | 0.009727 | 0.003270 | 0.001761 | 0.005256 | 0.002628 | 0.001314 | 0.000000 | 0.0 | 0.0 | ... | 0.00000 | 0.006039 | 0.002415 | 0.002415 | 0.369960 | 0.371644 | 0.373381 | 0.332519 | 0.335522 | 0.344440 |
| 2020-03-13 | 0.005256 | 0.011404 | 0.009727 | 0.003270 | 0.003942 | 0.005256 | 0.002628 | 0.003521 | 0.0 | 0.0 | ... | 0.00000 | 0.004227 | 0.006039 | 0.002415 | 0.370078 | 0.369960 | 0.371644 | 0.333397 | 0.332519 | 0.335522 |
3 rows × 28 columns
# Save the scaled 3-lag table as CSV, then reload it from disk and restore
# 'Fecha' as the index.
df_predictor_lag3_escalado.to_csv(path_or_buf = '../Datos_preprocesados/predictor_lag3_escalado.csv')
df_predictor_lag3_escalado = read_csv('../Datos_preprocesados/predictor_lag3_escalado.csv', encoding='latin-1', sep = ',', na_values = ['NaN', 'NaT'])
df_predictor_lag3_escalado = df_predictor_lag3_escalado.set_index('Fecha')
df_predictor_lag3_escalado.head(3)
| Incidentes | Pruebas t-1 | Pruebas t-2 | Pruebas t-3 | Incidentes t-1 | Incidentes t-2 | Incidentes t-3 | H t-1 | H t-2 | H t-3 | ... | Asintomaticos t-3 | Sintomaticos t-1 | Sintomaticos t-2 | Sintomaticos t-3 | Entradas t-1 | Entradas t-2 | Entradas t-3 | Salidas t-1 | Salidas t-2 | Salidas t-3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Fecha | |||||||||||||||||||||
| 2020-03-11 | 0.005256 | 0.003270 | 0.001761 | 0.010901 | 0.002628 | 0.001314 | 0.000438 | 0.000000 | 0.0 | 0.0 | ... | 0.00354 | 0.002415 | 0.002415 | 0.000604 | 0.371644 | 0.373381 | 0.338683 | 0.335522 | 0.344440 | 0.293448 |
| 2020-03-12 | 0.003942 | 0.009727 | 0.003270 | 0.001761 | 0.005256 | 0.002628 | 0.001314 | 0.000000 | 0.0 | 0.0 | ... | 0.00000 | 0.006039 | 0.002415 | 0.002415 | 0.369960 | 0.371644 | 0.373381 | 0.332519 | 0.335522 | 0.344440 |
| 2020-03-13 | 0.005256 | 0.011404 | 0.009727 | 0.003270 | 0.003942 | 0.005256 | 0.002628 | 0.003521 | 0.0 | 0.0 | ... | 0.00000 | 0.004227 | 0.006039 | 0.002415 | 0.370078 | 0.369960 | 0.371644 | 0.333397 | 0.332519 | 0.335522 |
3 rows × 28 columns
# Scale every column of the 5-lag predictor table (target included).
df_predictor_lag5_escalado = escalar_datos(df_predictor_lag5)
df_predictor_lag5_escalado.head(3)
| Incidentes | Pruebas t-1 | Pruebas t-2 | Pruebas t-3 | Pruebas t-4 | Pruebas t-5 | Incidentes t-1 | Incidentes t-2 | Incidentes t-3 | Incidentes t-4 | ... | Entradas t-1 | Entradas t-2 | Entradas t-3 | Entradas t-4 | Entradas t-5 | Salidas t-1 | Salidas t-2 | Salidas t-3 | Salidas t-4 | Salidas t-5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Fecha | |||||||||||||||||||||
| 2020-03-13 | 0.005256 | 0.011404 | 0.009727 | 0.003270 | 0.001761 | 0.010901 | 0.003942 | 0.005256 | 0.002628 | 0.001314 | ... | 0.370078 | 0.369960 | 0.371644 | 0.373381 | 0.338683 | 0.333397 | 0.332519 | 0.335522 | 0.344440 | 0.293448 |
| 2020-03-14 | 0.010074 | 0.019118 | 0.011404 | 0.009727 | 0.003270 | 0.001761 | 0.005256 | 0.003942 | 0.005256 | 0.002628 | ... | 0.364676 | 0.370078 | 0.369960 | 0.371644 | 0.373381 | 0.323666 | 0.333397 | 0.332519 | 0.335522 | 0.344440 |
| 2020-03-15 | 0.007884 | 0.016435 | 0.019118 | 0.011404 | 0.009727 | 0.003270 | 0.010074 | 0.005256 | 0.003942 | 0.005256 | ... | 0.149752 | 0.364676 | 0.370078 | 0.369960 | 0.371644 | 0.127497 | 0.323666 | 0.333397 | 0.332519 | 0.335522 |
3 rows × 46 columns
# Save the scaled 5-lag table as CSV, then reload it from disk and restore
# 'Fecha' as the index.
df_predictor_lag5_escalado.to_csv(path_or_buf = '../Datos_preprocesados/predictor_lag5_escalado.csv')
df_predictor_lag5_escalado = read_csv('../Datos_preprocesados/predictor_lag5_escalado.csv', encoding='latin-1', sep = ',', na_values = ['NaN', 'NaT'])
df_predictor_lag5_escalado = df_predictor_lag5_escalado.set_index('Fecha')
df_predictor_lag5_escalado.head(3)
| Incidentes | Pruebas t-1 | Pruebas t-2 | Pruebas t-3 | Pruebas t-4 | Pruebas t-5 | Incidentes t-1 | Incidentes t-2 | Incidentes t-3 | Incidentes t-4 | ... | Entradas t-1 | Entradas t-2 | Entradas t-3 | Entradas t-4 | Entradas t-5 | Salidas t-1 | Salidas t-2 | Salidas t-3 | Salidas t-4 | Salidas t-5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Fecha | |||||||||||||||||||||
| 2020-03-13 | 0.005256 | 0.011404 | 0.009727 | 0.003270 | 0.001761 | 0.010901 | 0.003942 | 0.005256 | 0.002628 | 0.001314 | ... | 0.370078 | 0.369960 | 0.371644 | 0.373381 | 0.338683 | 0.333397 | 0.332519 | 0.335522 | 0.344440 | 0.293448 |
| 2020-03-14 | 0.010074 | 0.019118 | 0.011404 | 0.009727 | 0.003270 | 0.001761 | 0.005256 | 0.003942 | 0.005256 | 0.002628 | ... | 0.364676 | 0.370078 | 0.369960 | 0.371644 | 0.373381 | 0.323666 | 0.333397 | 0.332519 | 0.335522 | 0.344440 |
| 2020-03-15 | 0.007884 | 0.016435 | 0.019118 | 0.011404 | 0.009727 | 0.003270 | 0.010074 | 0.005256 | 0.003942 | 0.005256 | ... | 0.149752 | 0.364676 | 0.370078 | 0.369960 | 0.371644 | 0.127497 | 0.323666 | 0.333397 | 0.332519 | 0.335522 |
3 rows × 46 columns
# Imports
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
import sklearn.metrics as sm
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn import svm
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import math
# Para guardar los modelos entrenados
import joblib as joblib
# Para crossvalidación
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
Usamos el cross validator de time series con 10 pliegues
# Shared cross-validation splitter: time-ordered splits, 10 folds.
cv = TimeSeriesSplit(n_splits = 10)
La siguiente función busca los mejores hiperparámetros usando como medida de error RMSE
def search_grid(estimador, grid, X_train, y_train):
    """Grid-search the hyperparameters of ``estimador`` and return the search.

    Every combination in ``grid`` is evaluated with the module-level ``cv``
    splitter and scored by negative RMSE, using all available cores. Training
    scores are recorded as well.

    Returns:
        The fitted GridSearchCV object.
    """
    busqueda = GridSearchCV(
        estimador,
        grid,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
        cv=cv,
        return_train_score=True,
    )
    busqueda.fit(X_train, y_train)
    return busqueda
Cálculo de errores MAE, RMSE y R2
def MAE(y_pred, y_test):
    """Print and return the mean absolute error, rounded to two decimals."""
    error = sm.mean_absolute_error(y_test, y_pred)
    mae = round(error, 2)
    print("Mean absolute error =", mae)
    return mae
def RMSE(y_pred, y_test):
    """Print and return the root mean squared error, rounded to two decimals."""
    mse = sm.mean_squared_error(y_test, y_pred)
    rmse = round(math.sqrt(mse), 2)
    print("RMSE =", rmse)
    return rmse
def R2(y_pred, y_test):
    """Print and return the R2 score, rounded to two decimals."""
    puntuacion = sm.r2_score(y_test, y_pred)
    r2 = round(puntuacion, 2)
    print("R2 =", r2)
    return r2
Función para dibujar la gráfica de comparación entre incidentes reales y predichos en el instante t+i
def grafica(p, t, i):
    """Plot predicted against real incidents for one forecast step.

    Args:
        p: predicted values; wrapped in a pandas Series before plotting.
        t: real values, an object with a ``plot`` method (e.g. a Series).
        i: zero-based step number; the title shows ``t+<i+1>``.
    """
    pd.Series(p).plot(color='darkorange', linewidth=1.4, alpha=0.9, label='Incidentes predecidos')
    t.plot(color='deepskyblue', linewidth=1.4, alpha=0.9, label='Incidentes Reales')
    # Title previously read 'Indicentes' (typo)
    plt.title('Incidentes t+' + str(i + 1), loc='center', fontsize=12, fontweight=0, color='black')
    plt.legend(loc='best')
    plt.xticks(rotation=90)
    plt.show()
Función para el cálculo de las predicciones 7 días adelante a partir de la predicción t para el modelo indicado
def _calcular_predicciones(modelo, y_pred, X_test, lag, pasos=7):
    """Feed predictions back as lagged incidents for ``pasos`` iterations.

    On each iteration the 'Incidentes t-k' columns of a working copy of
    ``X_test`` are shifted one position (t-1 -> t-2, ..., t-(lag-1) -> t-lag),
    'Incidentes t-1' is overwritten with the latest predictions, and the model
    predicts again. ``X_test`` itself is not modified.

    Returns:
        A list with one entry per iteration ``i`` (1-based), holding the new
        predictions with their first ``i`` values removed.
    """
    predicciones = []
    X_test_modelo = X_test.copy()
    for i in range(1, pasos + 1):
        # Shift from the oldest lag down so no value is overwritten before
        # it has been copied.
        for k in range(lag, 1, -1):
            X_test_modelo[f'Incidentes t-{k}'] = X_test_modelo[f'Incidentes t-{k - 1}']
        X_test_modelo['Incidentes t-1'] = y_pred
        y_pred = modelo.predict(X_test_modelo)
        # NOTE(review): this drops the first i predictions; confirm this is
        # the intended alignment with the targets used by the callers.
        predicciones.append(y_pred[i:])
    return predicciones


def calcular_predicciones_lag3(modelo, y_pred, X_test):
    """Seven feedback prediction rounds for a model trained with 3 lags.

    Args:
        modelo: fitted estimator exposing ``predict``.
        y_pred: the model's initial predictions for ``X_test``.
        X_test: test features containing 'Incidentes t-1' ... 'Incidentes t-3'.

    Returns:
        List of seven prediction arrays (see ``_calcular_predicciones``).
    """
    return _calcular_predicciones(modelo, y_pred, X_test, lag=3)


def calcular_predicciones_lag5(modelo, y_pred, X_test):
    """Seven feedback prediction rounds for a model trained with 5 lags.

    Args:
        modelo: fitted estimator exposing ``predict``.
        y_pred: the model's initial predictions for ``X_test``.
        X_test: test features containing 'Incidentes t-1' ... 'Incidentes t-5'.

    Returns:
        List of seven prediction arrays (see ``_calcular_predicciones``).
    """
    return _calcular_predicciones(modelo, y_pred, X_test, lag=5)
Función para calcular los errores y las gráficas asociadas a las predicciones realizadas en la función anterior
def graficas_errores(predicciones, y_test):
    """Plot each forecast step against the real values and collect its errors.

    For the i-th entry of ``predicciones`` one more leading row is removed
    from a copy of ``y_test``; the pair is plotted with ``grafica`` and scored
    with ``MAE``, ``RMSE`` and ``R2``.

    Returns:
        Three lists (MAE, RMSE, R2) with one value per entry of
        ``predicciones``.
    """
    erroresMAE, erroresRMSE, erroresR2 = [], [], []
    reales = y_test.copy()
    for paso, prediccion in enumerate(predicciones):
        serie_predicha = pd.Series(prediccion)
        # Remove the oldest remaining day (dropped by index label)
        reales = reales.drop([reales.index[0]])
        grafica(serie_predicha, reales, paso)
        erroresMAE.append(MAE(prediccion, reales))
        erroresRMSE.append(RMSE(prediccion, reales))
        erroresR2.append(R2(prediccion, reales))
    return erroresMAE, erroresRMSE, erroresR2
Visualización de todas las predicciones juntas
def visualizar_predicciones(y_test, predicciones):
    """Draw all multi-step predictions together with the real series in one figure.

    Every entry of ``predicciones`` is wrapped in a Series, shifted ``i``
    positions to the right (``i`` being its 1-based position in the list) and
    plotted after dropping the NaN values; ``y_test`` is plotted last as the
    reference curve and the figure is shown.

    Parameters
    ----------
    y_test : pandas.Series with the real values.
    predicciones : list of array-likes, one per forecast step.

    Returns
    -------
    None. The only effect is the matplotlib figure.
    """
    palette = plt.get_cmap('prism')
    for i, prediccion in enumerate(predicciones, start = 1):
        # Shift i positions to the right to place the curve on its predicted day
        s = pd.Series(prediccion).shift(i).dropna()
        s.plot(color = palette(i), linewidth = 1.4, alpha = 0.9, label = 'Incidentes predecidos t+'+str(i))
    y_test.plot(color = 'deepskyblue', linewidth = 1.4, alpha = 0.9, label = 'Incidentes Reales')
    plt.title('Evolución incidentes', loc ='center', fontsize = 12, fontweight = 0, color = 'black')
    plt.legend(loc='best')
    plt.xticks(rotation=90)
    plt.show()
Dividimos el conjunto de datos con una distribución 80% para train y 20% para test sin perder la temporalidad
En el caso de lag3, el conjunto de datos es de 355x28. El 80% de las filas son 355 x 0,8 = 284, mientras que el 20% son 355 x 0,2 = 71, así que tendremos 284 muestras para train y 71 para test
# Chronological 80/20 split of the lag-3 dataset: the 'Incidentes' column is the target
# and every other column is a predictor. shuffle = False keeps the original row order.
# NOTE(review): random_state presumably has no effect while shuffle = False - confirm
# against the train_test_split documentation.
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(df_predictor_lag3_escalado.drop(['Incidentes'], axis=1),
df_predictor_lag3_escalado['Incidentes'], train_size = 0.8, test_size = 0.2,
random_state = 42, shuffle = False)
En el caso de lag5 el conjunto de datos es de 353x28. El 80% de las filas son 353x0'8 = 282'4 -> 282 mientras que el 20% son 353x0'2 = 70'6 así que por el redondeo, tendremos 282 muestras para train y 71 para test
# Chronological 80/20 split of the lag-5 dataset: the 'Incidentes' column is the target
# and every other column is a predictor. shuffle = False keeps the original row order.
# NOTE(review): random_state presumably has no effect while shuffle = False - confirm
# against the train_test_split documentation.
X_train_5, X_test_5, y_train_5, y_test_5 = train_test_split(df_predictor_lag5_escalado.drop(['Incidentes'], axis=1),
df_predictor_lag5_escalado['Incidentes'], train_size = 0.8, test_size = 0.2,
random_state = 42, shuffle = False)
rf = RandomForestRegressor(random_state = 42)
Lo siguiente es la búsqueda de hiperparámetros. Para ello usaremos GridSearchCV, que prueba todas las combinaciones posibles de los parámetros que se le pasen usando validación cruzada. Veamos cuáles son los hiperparámetros de RF y sus valores por defecto
rf.get_params()
# Search space for the random forest: 1 * 5 * 3 * 4 * 4 * 5 = 1200 parameter combinations.
# The name `grid` is rebound further down for the other models, so this dict is only
# valid for the random-forest searches that follow it.
grid = {
'bootstrap': [True],
'max_depth': [20, 50, 60, 70, 80],
'max_features': [2, 3, 5],
'min_samples_leaf': [1, 2, 4, 5],
'min_samples_split': [4, 8, 10, 14],
'n_estimators': [100, 300, 500, 1000, 1500]
}
clf_rf = search_grid(rf, grid, X_train_3, y_train_3)
optimised_rf = clf_rf.best_estimator_
clf_rf.best_params_
{'bootstrap': True,
'max_depth': 20,
'max_features': 5,
'min_samples_leaf': 1,
'min_samples_split': 10,
'n_estimators': 300}
rf_results = pd.DataFrame(clf_rf.cv_results_)
rf_results.head(3)
| mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_bootstrap | param_max_depth | param_max_features | param_min_samples_leaf | param_min_samples_split | param_n_estimators | ... | split2_train_score | split3_train_score | split4_train_score | split5_train_score | split6_train_score | split7_train_score | split8_train_score | split9_train_score | mean_train_score | std_train_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.230483 | 0.010087 | 0.013764 | 0.001247 | True | 20 | 2 | 1 | 4 | 100 | ... | -0.002737 | -0.002415 | -0.002494 | -0.003163 | -0.005832 | -0.008506 | -0.012143 | -0.017030 | -0.006214 | 0.004670 |
| 1 | 0.822700 | 0.049547 | 0.047374 | 0.008924 | True | 20 | 2 | 1 | 4 | 300 | ... | -0.002774 | -0.002479 | -0.002548 | -0.003127 | -0.005801 | -0.008935 | -0.012458 | -0.017392 | -0.006318 | 0.004814 |
| 2 | 1.210761 | 0.127360 | 0.057546 | 0.009420 | True | 20 | 2 | 1 | 4 | 500 | ... | -0.002825 | -0.002459 | -0.002506 | -0.003088 | -0.005762 | -0.008826 | -0.012337 | -0.017481 | -0.006297 | 0.004816 |
3 rows × 36 columns
joblib.dump(optimised_rf, 'rf_lag3_escalado.pkl')
['rf_lag3_escalado.pkl']
optimised_rf_lag3 = joblib.load('rf_lag3_escalado.pkl')
y_pred = optimised_rf_lag3.predict(X_test_3)
RMSEs_test_lag3_rf = []
MAEs_test_lag3_rf = []
R2s_test_lag3_rf = []
RMSEs_test_lag3_rf.append(RMSE(y_pred, y_test_3))
MAEs_test_lag3_rf.append(MAE(y_pred, y_test_3))
R2s_test_lag3_rf.append(R2(y_pred, y_test_3))
RMSE = 0.22 Mean absolute error = 0.14 R2 = 0.4
grafica(y_pred, y_test_3, -1)
np.corrcoef(y_test_3, y_pred)
array([[1. , 0.84651874],
[0.84651874, 1. ]])
predicciones_rf_lag3 = calcular_predicciones_lag3(optimised_rf_lag3, y_pred, X_test_3)
visualizar_predicciones(y_test_3, predicciones_rf_lag3)
MAE_rf, RMSE_rf, R2_rf = graficas_errores(predicciones_rf_lag3, y_test_3)
Mean absolute error = 0.15 RMSE = 0.23 R2 = 0.37
Mean absolute error = 0.15 RMSE = 0.23 R2 = 0.36
Mean absolute error = 0.15 RMSE = 0.23 R2 = 0.36
Mean absolute error = 0.15 RMSE = 0.23 R2 = 0.36
Mean absolute error = 0.15 RMSE = 0.23 R2 = 0.35
Mean absolute error = 0.15 RMSE = 0.24 R2 = 0.34
Mean absolute error = 0.16 RMSE = 0.24 R2 = 0.34
# Add the multi-step errors after the single-step error already stored in each list.
RMSEs_test_lag3_rf.extend(RMSE_rf)
MAEs_test_lag3_rf.extend(MAE_rf)
R2s_test_lag3_rf.extend(R2_rf)
clf_rf = search_grid(rf, grid, X_train_5, y_train_5)
optimised_rf = clf_rf.best_estimator_
clf_rf.best_params_
{'bootstrap': True,
'max_depth': 20,
'max_features': 5,
'min_samples_leaf': 2,
'min_samples_split': 8,
'n_estimators': 100}
rf_results = pd.DataFrame(clf_rf.cv_results_)
rf_results.head(3)
| mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_bootstrap | param_max_depth | param_max_features | param_min_samples_leaf | param_min_samples_split | param_n_estimators | ... | split2_train_score | split3_train_score | split4_train_score | split5_train_score | split6_train_score | split7_train_score | split8_train_score | split9_train_score | mean_train_score | std_train_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.238150 | 0.032929 | 0.015190 | 0.000892 | True | 20 | 2 | 1 | 4 | 100 | ... | -0.002989 | -0.002695 | -0.002551 | -0.003324 | -0.006114 | -0.008909 | -0.012888 | -0.017250 | -0.006460 | 0.004775 |
| 1 | 0.714150 | 0.050421 | 0.033534 | 0.007744 | True | 20 | 2 | 1 | 4 | 300 | ... | -0.002869 | -0.002493 | -0.002467 | -0.003173 | -0.005845 | -0.008784 | -0.012696 | -0.017192 | -0.006311 | 0.004789 |
| 2 | 1.208178 | 0.081138 | 0.063989 | 0.010854 | True | 20 | 2 | 1 | 4 | 500 | ... | -0.002908 | -0.002511 | -0.002490 | -0.003147 | -0.005818 | -0.009112 | -0.012635 | -0.017075 | -0.006328 | 0.004769 |
3 rows × 36 columns
joblib.dump(optimised_rf, 'rf_lag5_escalado.pkl')
['rf_lag5_escalado.pkl']
optimised_rf_lag5 = joblib.load('rf_lag5_escalado.pkl')
y_pred = optimised_rf_lag5.predict(X_test_5)
RMSEs_test_lag5_rf = []
MAEs_test_lag5_rf = []
R2s_test_lag5_rf = []
RMSEs_test_lag5_rf.append(RMSE(y_pred, y_test_5))
MAEs_test_lag5_rf.append(MAE(y_pred, y_test_5))
R2s_test_lag5_rf.append(R2(y_pred, y_test_5))
RMSE = 0.23 Mean absolute error = 0.14 R2 = 0.37
grafica(y_pred, y_test_5, -1)
np.corrcoef(y_test_5, y_pred)
array([[1. , 0.85867788],
[0.85867788, 1. ]])
predicciones_rf_lag5 = calcular_predicciones_lag5(optimised_rf_lag5, y_pred, X_test_5)
visualizar_predicciones(y_test_5, predicciones_rf_lag5)
MAE_rf, RMSE_rf, R2_rf = graficas_errores(predicciones_rf_lag5, y_test_5)
Mean absolute error = 0.15 RMSE = 0.23 R2 = 0.35
Mean absolute error = 0.15 RMSE = 0.23 R2 = 0.35
Mean absolute error = 0.15 RMSE = 0.23 R2 = 0.35
Mean absolute error = 0.15 RMSE = 0.23 R2 = 0.35
Mean absolute error = 0.15 RMSE = 0.24 R2 = 0.34
Mean absolute error = 0.16 RMSE = 0.24 R2 = 0.33
Mean absolute error = 0.16 RMSE = 0.24 R2 = 0.33
# Add the multi-step errors after the single-step error already stored in each list.
RMSEs_test_lag5_rf.extend(RMSE_rf)
MAEs_test_lag5_rf.extend(MAE_rf)
R2s_test_lag5_rf.extend(R2_rf)
regresion_lineal = LinearRegression()
regresion_lineal.fit(X_train_3, y_train_3)
LinearRegression()
joblib.dump(regresion_lineal, 'regresion_lineal_lag3_escalado.pkl')
['regresion_lineal_lag3_escalado.pkl']
regresion_lineal_lag3 = joblib.load('regresion_lineal_lag3_escalado.pkl')
y_pred = regresion_lineal_lag3.predict(X_test_3)
RMSEs_test_lag3_LR = []
MAEs_test_lag3_LR = []
R2s_test_lag3_LR = []
RMSEs_test_lag3_LR.append(RMSE(y_pred, y_test_3))
MAEs_test_lag3_LR.append(MAE(y_pred, y_test_3))
R2s_test_lag3_LR.append(R2(y_pred, y_test_3))
RMSE = 0.13 Mean absolute error = 0.09 R2 = 0.78
grafica(y_pred, y_test_3, -1)
np.corrcoef(y_test_3, y_pred)
array([[1. , 0.90277771],
[0.90277771, 1. ]])
Creamos una lista donde irán las predicciones t+i con i = 1,...,7
predicciones_LR_lag3 = calcular_predicciones_lag3(regresion_lineal_lag3, y_pred, X_test_3)
visualizar_predicciones(y_test_3, predicciones_LR_lag3)
Calculamos las gráficas de las predicciones así como los errores
MAE_LR, RMSE_LR, R2_LR = graficas_errores(predicciones_LR_lag3, y_test_3)
Mean absolute error = 0.11 RMSE = 0.18 R2 = 0.61
Mean absolute error = 0.14 RMSE = 0.21 R2 = 0.46
Mean absolute error = 0.15 RMSE = 0.23 R2 = 0.39
Mean absolute error = 0.17 RMSE = 0.27 R2 = 0.13
Mean absolute error = 0.19 RMSE = 0.3 R2 = -0.03
Mean absolute error = 0.21 RMSE = 0.32 R2 = -0.17
Mean absolute error = 0.23 RMSE = 0.35 R2 = -0.41
# Add the multi-step errors after the single-step error already stored in each list.
RMSEs_test_lag3_LR.extend(RMSE_LR)
MAEs_test_lag3_LR.extend(MAE_LR)
R2s_test_lag3_LR.extend(R2_LR)
regresion_lineal.fit(X_train_5, y_train_5)
LinearRegression()
joblib.dump(regresion_lineal, 'regresion_lineal_lag5_escalado.pkl')
['regresion_lineal_lag5_escalado.pkl']
regresion_lineal_lag5 = joblib.load('regresion_lineal_lag5_escalado.pkl')
y_pred = regresion_lineal_lag5.predict(X_test_5)
RMSEs_test_lag5_LR = []
MAEs_test_lag5_LR = []
R2s_test_lag5_LR = []
RMSEs_test_lag5_LR.append(RMSE(y_pred, y_test_5))
MAEs_test_lag5_LR.append(MAE(y_pred, y_test_5))
R2s_test_lag5_LR.append(R2(y_pred, y_test_5))
RMSE = 0.13 Mean absolute error = 0.08 R2 = 0.8
grafica(y_pred, y_test_5, -1)
predicciones_LR_lag5 = calcular_predicciones_lag5(regresion_lineal_lag5, y_pred, X_test_5)
visualizar_predicciones(y_test_5, predicciones_LR_lag5)
MAE_LR, RMSE_LR, R2_LR = graficas_errores(predicciones_LR_lag5, y_test_5)
Mean absolute error = 0.11 RMSE = 0.16 R2 = 0.69
Mean absolute error = 0.12 RMSE = 0.17 R2 = 0.65
Mean absolute error = 0.13 RMSE = 0.19 R2 = 0.56
Mean absolute error = 0.17 RMSE = 0.25 R2 = 0.28
Mean absolute error = 0.17 RMSE = 0.25 R2 = 0.24
Mean absolute error = 0.21 RMSE = 0.3 R2 = -0.09
Mean absolute error = 0.23 RMSE = 0.33 R2 = -0.28
# Add the multi-step errors after the single-step error already stored in each list.
RMSEs_test_lag5_LR.extend(RMSE_LR)
MAEs_test_lag5_LR.extend(MAE_LR)
R2s_test_lag5_LR.extend(R2_LR)
svr = svm.SVR()
svr.get_params()
{'C': 1.0,
'cache_size': 200,
'coef0': 0.0,
'degree': 3,
'epsilon': 0.1,
'gamma': 'scale',
'kernel': 'rbf',
'max_iter': -1,
'shrinking': True,
'tol': 0.001,
'verbose': False}
# Search space for SVR: 8 * 3 * 1 * 3 * 3 * 4 = 864 parameter combinations.
# NOTE(review): according to the scikit-learn SVR documentation `degree` is only used by
# the 'poly' kernel and `coef0` only by 'poly' and 'sigmoid', so many of the 'linear'
# candidates are duplicates; a list of per-kernel grids would avoid the repeated fits.
grid = {'C' : [0.25, 0.5, 1, 2, 4, 8, 16, 32],
# Alternative range kept for reference: 'C' : [0.001, 0.01, 0.1, 1],
'epsilon' : [0.1, 0.01, 0.05],
'gamma' : ['auto'],
'degree' : [2,3,4],
'kernel' : ['sigmoid', 'linear', 'poly'],
'coef0' : [0.0, 0.01, 0.05, 0.1]}
clf_svr = search_grid(svr, grid, X_train_3, y_train_3)
clf_svr.best_params_
{'C': 32,
'coef0': 0.0,
'degree': 2,
'epsilon': 0.01,
'gamma': 'auto',
'kernel': 'linear'}
svr_results = pd.DataFrame(clf_svr.cv_results_)
svr_results.head(3)
| mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_C | param_coef0 | param_degree | param_epsilon | param_gamma | param_kernel | ... | split2_train_score | split3_train_score | split4_train_score | split5_train_score | split6_train_score | split7_train_score | split8_train_score | split9_train_score | mean_train_score | std_train_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.004388 | 0.001111 | 0.003491 | 0.002241 | 0.25 | 0 | 2 | 0.1 | auto | sigmoid | ... | -0.020984 | -0.021895 | -0.021811 | -0.027055 | -0.086278 | -0.084233 | -0.079713 | -0.081937 | -0.045886 | 0.030491 |
| 1 | 0.004787 | 0.002742 | 0.007481 | 0.008582 | 0.25 | 0 | 2 | 0.1 | auto | linear | ... | -0.020984 | -0.021895 | -0.021811 | -0.027055 | -0.085420 | -0.070687 | -0.040356 | -0.057981 | -0.038114 | 0.023457 |
| 2 | 0.003989 | 0.000892 | 0.006084 | 0.006517 | 0.25 | 0 | 2 | 0.1 | auto | poly | ... | -0.020984 | -0.021895 | -0.021811 | -0.027055 | -0.086582 | -0.088656 | -0.097484 | -0.098198 | -0.049762 | 0.035335 |
3 rows × 36 columns
optimised_svr = clf_svr.best_estimator_
joblib.dump(optimised_svr, 'svr_lag3_escalado.pkl')
['svr_lag3_escalado.pkl']
optimised_svr_lag3 = joblib.load('svr_lag3_escalado.pkl')
y_pred = optimised_svr_lag3.predict(X_test_3)
RMSEs_test_lag3_SVR = []
MAEs_test_lag3_SVR = []
R2s_test_lag3_SVR = []
RMSEs_test_lag3_SVR.append(RMSE(y_pred, y_test_3))
MAEs_test_lag3_SVR.append(MAE(y_pred, y_test_3))
R2s_test_lag3_SVR.append(R2(y_pred, y_test_3))
RMSE = 0.13 Mean absolute error = 0.09 R2 = 0.8
grafica(y_pred, y_test_3, -1)
predicciones_svr_lag3 = calcular_predicciones_lag3(optimised_svr_lag3, y_pred, X_test_3)
visualizar_predicciones(y_test_3, predicciones_svr_lag3)
MAE_SVR, RMSE_SVR, R2_SVR = graficas_errores(predicciones_svr_lag3, y_test_3)
Mean absolute error = 0.1 RMSE = 0.16 R2 = 0.69
Mean absolute error = 0.12 RMSE = 0.19 R2 = 0.58
Mean absolute error = 0.13 RMSE = 0.2 R2 = 0.53
Mean absolute error = 0.15 RMSE = 0.23 R2 = 0.38
Mean absolute error = 0.17 RMSE = 0.26 R2 = 0.23
Mean absolute error = 0.18 RMSE = 0.27 R2 = 0.12
Mean absolute error = 0.2 RMSE = 0.3 R2 = -0.04
# Add the multi-step errors after the single-step error already stored in each list.
RMSEs_test_lag3_SVR.extend(RMSE_SVR)
MAEs_test_lag3_SVR.extend(MAE_SVR)
R2s_test_lag3_SVR.extend(R2_SVR)
clf_svr = search_grid(svr, grid, X_train_5, y_train_5)
clf_svr.best_params_
{'C': 16,
'coef0': 0.0,
'degree': 2,
'epsilon': 0.01,
'gamma': 'auto',
'kernel': 'linear'}
svr_results = pd.DataFrame(clf_svr.cv_results_)
svr_results.head(3)
| mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_C | param_coef0 | param_degree | param_epsilon | param_gamma | param_kernel | ... | split2_train_score | split3_train_score | split4_train_score | split5_train_score | split6_train_score | split7_train_score | split8_train_score | split9_train_score | mean_train_score | std_train_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.006470 | 0.004916 | 0.003191 | 3.989541e-04 | 0.25 | 0 | 2 | 0.1 | auto | sigmoid | ... | -0.020972 | -0.021903 | -0.021816 | -0.02703 | -0.086462 | -0.084307 | -0.081021 | -0.083449 | -0.046131 | 0.030912 |
| 1 | 0.005087 | 0.001509 | 0.006483 | 6.529154e-03 | 0.25 | 0 | 2 | 0.1 | auto | linear | ... | -0.020972 | -0.021903 | -0.021816 | -0.02703 | -0.086055 | -0.075169 | -0.040656 | -0.056998 | -0.038495 | 0.024216 |
| 2 | 0.007779 | 0.008186 | 0.002992 | 9.419797e-07 | 0.25 | 0 | 2 | 0.1 | auto | poly | ... | -0.020972 | -0.021903 | -0.021816 | -0.02703 | -0.086306 | -0.088386 | -0.098377 | -0.098862 | -0.049800 | 0.035548 |
3 rows × 36 columns
optimised_svr = clf_svr.best_estimator_
joblib.dump(optimised_svr, 'svr_lag5_escalado.pkl')
['svr_lag5_escalado.pkl']
optimised_svr_lag5 = joblib.load('svr_lag5_escalado.pkl')
y_pred = optimised_svr_lag5.predict(X_test_5)
RMSEs_test_lag5_SVR = []
MAEs_test_lag5_SVR = []
R2s_test_lag5_SVR = []
RMSEs_test_lag5_SVR.append(RMSE(y_pred, y_test_5))
MAEs_test_lag5_SVR.append(MAE(y_pred, y_test_5))
R2s_test_lag5_SVR.append(R2(y_pred, y_test_5))
RMSE = 0.13 Mean absolute error = 0.08 R2 = 0.8
grafica(y_pred, y_test_5, -1)
predicciones_svr_lag5 = calcular_predicciones_lag5(optimised_svr_lag5, y_pred, X_test_5)
visualizar_predicciones(y_test_5, predicciones_svr_lag5)
MAE_SVR, RMSE_SVR, R2_SVR = graficas_errores(predicciones_svr_lag5, y_test_5)
Mean absolute error = 0.09 RMSE = 0.14 R2 = 0.77
Mean absolute error = 0.1 RMSE = 0.15 R2 = 0.73
Mean absolute error = 0.11 RMSE = 0.17 R2 = 0.67
Mean absolute error = 0.14 RMSE = 0.21 R2 = 0.5
Mean absolute error = 0.14 RMSE = 0.21 R2 = 0.47
Mean absolute error = 0.16 RMSE = 0.24 R2 = 0.32
Mean absolute error = 0.19 RMSE = 0.27 R2 = 0.13
# Add the multi-step errors after the single-step error already stored in each list.
RMSEs_test_lag5_SVR.extend(RMSE_SVR)
MAEs_test_lag5_SVR.extend(MAE_SVR)
R2s_test_lag5_SVR.extend(R2_SVR)
lasso = Lasso(random_state = 42)
Lasso minimiza la función objetivo:

$$\frac{1}{2\,n_{\text{samples}}}\,\lVert y - Xw \rVert_2^2 + \alpha\,\lVert w \rVert_1$$
lasso.get_params()
{'alpha': 1.0,
'copy_X': True,
'fit_intercept': True,
'max_iter': 1000,
'normalize': False,
'positive': False,
'precompute': False,
'random_state': 42,
'selection': 'cyclic',
'tol': 0.0001,
'warm_start': False}
# Search space for Lasso: 200 * 5 * 7 = 7000 parameter combinations.
# `alpha` is sampled on a logarithmic scale from 1e-10 to 1e3.
grid = {# Alternative range kept for reference: 'alpha' : [0.9, 1, 1.1, 1.2, 1.3, 1.5],
'alpha': np.logspace(-10, 3, 200),
'max_iter' : [1000, 3000, 5000, 7000, 10000],
'tol' : [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.5]}
clf_lasso = search_grid(lasso, grid, X_train_3, y_train_3)
clf_lasso.best_params_
{'alpha': 2.768286630392067e-06, 'max_iter': 3000, 'tol': 0.001}
lasso_results = pd.DataFrame(clf_lasso.cv_results_)#.sort_values(by = ['rank_test_score'])
#lasso_results.head(3)
#lasso_results[
# ['params', 'rank_test_score', 'mean_test_score', 'std_test_score']]
lasso_results.head(3)
| mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_alpha | param_max_iter | param_tol | params | split0_test_score | split1_test_score | ... | split2_train_score | split3_train_score | split4_train_score | split5_train_score | split6_train_score | split7_train_score | split8_train_score | split9_train_score | mean_train_score | std_train_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.013264 | 0.001003 | 0.003292 | 0.000458 | 1e-10 | 1000 | 0.0001 | {'alpha': 1e-10, 'max_iter': 1000, 'tol': 0.0001} | -0.017193 | -0.005675 | ... | -0.003463 | -0.00318 | -0.003783 | -0.006043 | -0.01041 | -0.017316 | -0.023809 | -0.031761 | -0.010616 | 0.009749 |
| 1 | 0.013962 | 0.001945 | 0.003890 | 0.001509 | 1e-10 | 1000 | 0.001 | {'alpha': 1e-10, 'max_iter': 1000, 'tol': 0.001} | -0.017193 | -0.005675 | ... | -0.003463 | -0.00318 | -0.003783 | -0.006043 | -0.01041 | -0.017316 | -0.023809 | -0.031761 | -0.010616 | 0.009749 |
| 2 | 0.015558 | 0.003065 | 0.003590 | 0.001111 | 1e-10 | 1000 | 0.01 | {'alpha': 1e-10, 'max_iter': 1000, 'tol': 0.01} | -0.017193 | -0.005675 | ... | -0.003463 | -0.00318 | -0.003783 | -0.006043 | -0.01041 | -0.017316 | -0.023809 | -0.031761 | -0.010616 | 0.009749 |
3 rows × 33 columns
optimised_lasso = clf_lasso.best_estimator_
joblib.dump(optimised_lasso, 'lasso_lag3_escalado.pkl')
['lasso_lag3_escalado.pkl']
optimised_lasso_lag3 = joblib.load('lasso_lag3_escalado.pkl')
y_pred = optimised_lasso_lag3.predict(X_test_3)
RMSEs_test_lag3_lasso = []
MAEs_test_lag3_lasso = []
R2s_test_lag3_lasso = []
RMSEs_test_lag3_lasso.append(RMSE(y_pred, y_test_3))
MAEs_test_lag3_lasso.append(MAE(y_pred, y_test_3))
R2s_test_lag3_lasso.append(R2(y_pred, y_test_3))
RMSE = 0.13 Mean absolute error = 0.09 R2 = 0.79
grafica(y_pred, y_test_3, -1)
np.corrcoef(y_test_3, y_pred)
array([[1. , 0.90318007],
[0.90318007, 1. ]])
predicciones_lasso_lag3 = calcular_predicciones_lag3(optimised_lasso_lag3, y_pred, X_test_3)
visualizar_predicciones(y_test_3, predicciones_lasso_lag3)
MAE_lasso, RMSE_lasso, R2_lasso = graficas_errores(predicciones_lasso_lag3, y_test_3)
Mean absolute error = 0.1 RMSE = 0.16 R2 = 0.67
Mean absolute error = 0.13 RMSE = 0.2 R2 = 0.54
Mean absolute error = 0.13 RMSE = 0.21 R2 = 0.49
Mean absolute error = 0.15 RMSE = 0.23 R2 = 0.36
Mean absolute error = 0.17 RMSE = 0.26 R2 = 0.23
Mean absolute error = 0.17 RMSE = 0.27 R2 = 0.16
Mean absolute error = 0.19 RMSE = 0.28 R2 = 0.05
# Add the multi-step errors after the single-step error already stored in each list.
RMSEs_test_lag3_lasso.extend(RMSE_lasso)
MAEs_test_lag3_lasso.extend(MAE_lasso)
R2s_test_lag3_lasso.extend(R2_lasso)
clf_lasso = search_grid(lasso, grid, X_train_5, y_train_5)
clf_lasso.best_params_
{'alpha': 2.768286630392067e-06, 'max_iter': 5000, 'tol': 0.001}
lasso_results = pd.DataFrame(clf_lasso.cv_results_)#.sort_values(by = ['rank_test_score'])
#lasso_results.head(3)
#lasso_results[
# ['params', 'rank_test_score', 'mean_test_score', 'std_test_score']]
lasso_results.head(3)
| mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_alpha | param_max_iter | param_tol | params | split0_test_score | split1_test_score | ... | split2_train_score | split3_train_score | split4_train_score | split5_train_score | split6_train_score | split7_train_score | split8_train_score | split9_train_score | mean_train_score | std_train_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.025032 | 0.009633 | 0.004290 | 0.001611 | 1e-10 | 1000 | 0.0001 | {'alpha': 1e-10, 'max_iter': 1000, 'tol': 0.0001} | -0.024179 | -0.007147 | ... | -0.002734 | -0.00264 | -0.003072 | -0.00512 | -0.008976 | -0.014611 | -0.021445 | -0.029719 | -0.009168 | 0.009241 |
| 1 | 0.021642 | 0.004325 | 0.004389 | 0.001351 | 1e-10 | 1000 | 0.001 | {'alpha': 1e-10, 'max_iter': 1000, 'tol': 0.001} | -0.024179 | -0.007147 | ... | -0.002734 | -0.00264 | -0.003072 | -0.00512 | -0.008976 | -0.014611 | -0.021445 | -0.029719 | -0.009168 | 0.009241 |
| 2 | 0.020645 | 0.007261 | 0.002992 | 0.000446 | 1e-10 | 1000 | 0.01 | {'alpha': 1e-10, 'max_iter': 1000, 'tol': 0.01} | -0.018363 | -0.007147 | ... | -0.002734 | -0.00264 | -0.003072 | -0.00512 | -0.008976 | -0.014611 | -0.021445 | -0.029719 | -0.009258 | 0.009163 |
3 rows × 33 columns
optimised_lasso = clf_lasso.best_estimator_
joblib.dump(optimised_lasso, 'lasso_lag5_escalado.pkl')
['lasso_lag5_escalado.pkl']
optimised_lasso_lag5 = joblib.load('lasso_lag5_escalado.pkl')
y_pred = optimised_lasso_lag5.predict(X_test_5)
RMSEs_test_lag5_lasso = []
MAEs_test_lag5_lasso = []
R2s_test_lag5_lasso = []
RMSEs_test_lag5_lasso.append(RMSE(y_pred, y_test_5))
MAEs_test_lag5_lasso.append(MAE(y_pred, y_test_5))
R2s_test_lag5_lasso.append(R2(y_pred, y_test_5))
RMSE = 0.13 Mean absolute error = 0.08 R2 = 0.8
grafica(y_pred, y_test_5, -1)
np.corrcoef(y_test_5, y_pred)
array([[1. , 0.90748449],
[0.90748449, 1. ]])
predicciones_lasso_lag5 = calcular_predicciones_lag5(optimised_lasso_lag5, y_pred, X_test_5)
visualizar_predicciones(y_test_5, predicciones_lasso_lag5)
MAE_lasso, RMSE_lasso, R2_lasso = graficas_errores(predicciones_lasso_lag5, y_test_5)
Mean absolute error = 0.09 RMSE = 0.14 R2 = 0.75
Mean absolute error = 0.11 RMSE = 0.16 R2 = 0.69
Mean absolute error = 0.12 RMSE = 0.18 R2 = 0.61
Mean absolute error = 0.15 RMSE = 0.22 R2 = 0.44
Mean absolute error = 0.16 RMSE = 0.23 R2 = 0.38
Mean absolute error = 0.17 RMSE = 0.25 R2 = 0.24
Mean absolute error = 0.19 RMSE = 0.28 R2 = 0.09
# Add the multi-step errors after the single-step error already stored in each list.
RMSEs_test_lag5_lasso.extend(RMSE_lasso)
MAEs_test_lag5_lasso.extend(MAE_lasso)
R2s_test_lag5_lasso.extend(R2_lasso)
La idea de este modelo es construir árboles en serie, de forma que cada árbol posterior intenta corregir los errores de todos los anteriores. Los parámetros importantes son la profundidad de los árboles, cuándo hacer pre-pruning (poda previa, útil para evitar el overfitting) y también el ratio de aprendizaje o learning_rate, que controla cuánto se esfuerza el árbol actual en corregir los errores de los árboles anteriores. También podemos aumentar el número de árboles con el parámetro n_estimators. Al contrario que en random forest, usar un número de árboles muy alto en gradient boosting llevará a un modelo mucho más complejo que puede caer en overfitting. Max_depth suele variar entre 1 y 5
# Instancia del modelo
gbr = GradientBoostingRegressor(random_state = 42)
gbr.get_params()
{'alpha': 0.9,
'ccp_alpha': 0.0,
'criterion': 'friedman_mse',
'init': None,
'learning_rate': 0.1,
'loss': 'ls',
'max_depth': 3,
'max_features': None,
'max_leaf_nodes': None,
'min_impurity_decrease': 0.0,
'min_impurity_split': None,
'min_samples_leaf': 1,
'min_samples_split': 2,
'min_weight_fraction_leaf': 0.0,
'n_estimators': 100,
'n_iter_no_change': None,
'presort': 'deprecated',
'random_state': 42,
'subsample': 1.0,
'tol': 0.0001,
'validation_fraction': 0.1,
'verbose': 0,
'warm_start': False}
# Search space for gradient boosting: 4 * 5 * 3 * 3 = 180 parameter combinations.
# learning_rate is not part of the grid, so it keeps the value set on the estimator.
grid = {'n_estimators':[50, 100, 250, 500],
'max_depth':[3, 4, 5, 8, 10],
'min_samples_leaf':[4, 8, 10],
'max_features':[2, 3, 5]}
clf_gbr = search_grid(gbr, grid, X_train_3, y_train_3)
clf_gbr.best_params_
{'max_depth': 3, 'max_features': 5, 'min_samples_leaf': 4, 'n_estimators': 50}
gbr_results = pd.DataFrame(clf_gbr.cv_results_)
gbr_results.head(3)
| mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_max_depth | param_max_features | param_min_samples_leaf | param_n_estimators | params | split0_test_score | ... | split2_train_score | split3_train_score | split4_train_score | split5_train_score | split6_train_score | split7_train_score | split8_train_score | split9_train_score | mean_train_score | std_train_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.026728 | 0.003301 | 0.003591 | 0.000488 | 3 | 2 | 4 | 50 | {'max_depth': 3, 'max_features': 2, 'min_sampl... | -0.011109 | ... | -0.000984 | -0.000940 | -0.001136 | -0.001857 | -0.002766 | -0.004857 | -0.007097 | -0.010549 | -0.003219 | 0.003128 |
| 1 | 0.048071 | 0.005330 | 0.003292 | 0.000457 | 3 | 2 | 4 | 100 | {'max_depth': 3, 'max_features': 2, 'min_sampl... | -0.011064 | ... | -0.000477 | -0.000515 | -0.000733 | -0.001076 | -0.001635 | -0.003014 | -0.004173 | -0.006568 | -0.001894 | 0.001972 |
| 2 | 0.106315 | 0.010900 | 0.003392 | 0.000489 | 3 | 2 | 4 | 250 | {'max_depth': 3, 'max_features': 2, 'min_sampl... | -0.011034 | ... | -0.000093 | -0.000138 | -0.000268 | -0.000416 | -0.000569 | -0.001099 | -0.001645 | -0.002426 | -0.000677 | 0.000763 |
3 rows × 34 columns
optimised_gbr = clf_gbr.best_estimator_
joblib.dump(optimised_gbr, 'gbr_lag3_escalado.pkl')
['gbr_lag3_escalado.pkl']
optimised_gbr_lag3 = joblib.load('gbr_lag3_escalado.pkl')
y_pred = optimised_gbr_lag3.predict(X_test_3)
RMSEs_test_lag3_gbr = []
MAEs_test_lag3_gbr = []
R2s_test_lag3_gbr = []
RMSEs_test_lag3_gbr.append(RMSE(y_pred, y_test_3))
MAEs_test_lag3_gbr.append(MAE(y_pred, y_test_3))
R2s_test_lag3_gbr.append(R2(y_pred, y_test_3))
RMSE = 0.21 Mean absolute error = 0.13 R2 = 0.45
grafica(y_pred, y_test_3, -1)
np.corrcoef(y_test_3, y_pred)
array([[1. , 0.8521839],
[0.8521839, 1. ]])
predicciones_gbr_lag3 = calcular_predicciones_lag3(optimised_gbr_lag3, y_pred, X_test_3)
visualizar_predicciones(y_test_3, predicciones_gbr_lag3)
MAE_gbr, RMSE_gbr, R2_gbr = graficas_errores(predicciones_gbr_lag3, y_test_3)
Mean absolute error = 0.14 RMSE = 0.22 R2 = 0.39
Mean absolute error = 0.14 RMSE = 0.22 R2 = 0.4
Mean absolute error = 0.14 RMSE = 0.23 R2 = 0.39
Mean absolute error = 0.14 RMSE = 0.23 R2 = 0.39
Mean absolute error = 0.15 RMSE = 0.23 R2 = 0.38
Mean absolute error = 0.15 RMSE = 0.23 R2 = 0.38
Mean absolute error = 0.15 RMSE = 0.23 R2 = 0.37
# Add the multi-step errors after the single-step error already stored in each list.
RMSEs_test_lag3_gbr.extend(RMSE_gbr)
MAEs_test_lag3_gbr.extend(MAE_gbr)
R2s_test_lag3_gbr.extend(R2_gbr)
clf_gbr = search_grid(gbr, grid, X_train_5, y_train_5)
clf_gbr.best_params_
{'max_depth': 10,
'max_features': 5,
'min_samples_leaf': 10,
'n_estimators': 50}
gbr_results = pd.DataFrame(clf_gbr.cv_results_)
gbr_results.head(3)
| mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_max_depth | param_max_features | param_min_samples_leaf | param_n_estimators | params | split0_test_score | ... | split2_train_score | split3_train_score | split4_train_score | split5_train_score | split6_train_score | split7_train_score | split8_train_score | split9_train_score | mean_train_score | std_train_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.027028 | 0.004645 | 0.003990 | 0.001728 | 3 | 2 | 4 | 50 | {'max_depth': 3, 'max_features': 2, 'min_sampl... | -0.014645 | ... | -0.001422 | -0.001333 | -0.001570 | -0.002522 | -0.004138 | -0.008360 | -0.012606 | -0.017716 | -0.005256 | 0.005480 |
| 1 | 0.053158 | 0.005951 | 0.003590 | 0.000489 | 3 | 2 | 4 | 100 | {'max_depth': 3, 'max_features': 2, 'min_sampl... | -0.014442 | ... | -0.000582 | -0.000663 | -0.000868 | -0.001410 | -0.002275 | -0.004778 | -0.007014 | -0.010757 | -0.002930 | 0.003332 |
| 2 | 0.106315 | 0.012427 | 0.003591 | 0.000662 | 3 | 2 | 4 | 250 | {'max_depth': 3, 'max_features': 2, 'min_sampl... | -0.014455 | ... | -0.000112 | -0.000146 | -0.000245 | -0.000469 | -0.000746 | -0.001507 | -0.002315 | -0.003828 | -0.000947 | 0.001191 |
3 rows × 34 columns
optimised_gbr = clf_gbr.best_estimator_
joblib.dump(optimised_gbr, 'gbr_lag5_escalado.pkl')
['gbr_lag5_escalado.pkl']
optimised_gbr_lag5 = joblib.load('gbr_lag5_escalado.pkl')
y_pred = optimised_gbr_lag5.predict(X_test_5)
RMSEs_test_lag5_gbr = []
MAEs_test_lag5_gbr = []
R2s_test_lag5_gbr = []
RMSEs_test_lag5_gbr.append(RMSE(y_pred, y_test_5))
MAEs_test_lag5_gbr.append(MAE(y_pred, y_test_5))
R2s_test_lag5_gbr.append(R2(y_pred, y_test_5))
RMSE = 0.21 Mean absolute error = 0.13 R2 = 0.46
grafica(y_pred, y_test_5, -1)
np.corrcoef(y_test_5, y_pred)
array([[1. , 0.85991689],
[0.85991689, 1. ]])
predicciones_gbr_lag5 = calcular_predicciones_lag5(optimised_gbr_lag5, y_pred, X_test_5)
visualizar_predicciones(y_test_5, predicciones_gbr_lag5)
MAE_gbr, RMSE_gbr, R2_gbr = graficas_errores(predicciones_gbr_lag5, y_test_5)
Mean absolute error = 0.14 RMSE = 0.21 R2 = 0.44
Mean absolute error = 0.14 RMSE = 0.22 R2 = 0.44
Mean absolute error = 0.14 RMSE = 0.22 R2 = 0.44
Mean absolute error = 0.14 RMSE = 0.22 R2 = 0.44
Mean absolute error = 0.14 RMSE = 0.22 R2 = 0.44
Mean absolute error = 0.14 RMSE = 0.22 R2 = 0.43
Mean absolute error = 0.14 RMSE = 0.22 R2 = 0.42
# Add the multi-step errors after the single-step error already stored in each list.
RMSEs_test_lag5_gbr.extend(RMSE_gbr)
MAEs_test_lag5_gbr.extend(MAE_gbr)
R2s_test_lag5_gbr.extend(R2_gbr)
Minimiza la función objetivo:
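$$\frac{1}{2\,n_{\text{samples}}}\,\lVert y - Xw \rVert_2^2 + \alpha \cdot \text{l1\_ratio} \cdot \lVert w \rVert_1 + 0.5 \cdot \alpha \cdot (1 - \text{l1\_ratio}) \cdot \lVert w \rVert_2^2$$

Si se quieren controlar por separado las penalizaciones L1 y L2, el término de penalización equivale a $a\,\lVert w \rVert_1 + 0.5\,b\,\lVert w \rVert_2^2$, donde $\alpha = a + b$ y $\text{l1\_ratio} = a / (a + b)$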
elastic_net = ElasticNet(random_state = 42)
# Search space for ElasticNet: 9 * 100 * 3 * 7 = 18900 parameter combinations.
# np.arange(0, 1, 0.01) yields l1_ratio values 0.00 ... 0.99, so l1_ratio = 1.0 is never tried.
# NOTE(review): the grid includes alpha = 0.0; the scikit-learn documentation advises
# against alpha = 0 for this estimator - confirm whether that candidate should stay.
grid = {'alpha' : [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0.0, 1.0, 10.0, 100.0],
'l1_ratio' : np.arange(0, 1, 0.01),
'max_iter' : [10000,15000,20000],
'tol' : [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.5]}
clf_en = search_grid(elastic_net, grid, X_train_3, y_train_3)
clf_en.best_params_
{'alpha': 1e-05, 'l1_ratio': 0.17, 'max_iter': 10000, 'tol': 0.001}
en_results = pd.DataFrame(clf_en.cv_results_)
en_results.head(3)
| mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_alpha | param_l1_ratio | param_max_iter | param_tol | params | split0_test_score | ... | split2_train_score | split3_train_score | split4_train_score | split5_train_score | split6_train_score | split7_train_score | split8_train_score | split9_train_score | mean_train_score | std_train_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.101225 | 0.029870 | 0.003756 | 0.004432 | 1e-05 | 0 | 10000 | 0.0001 | {'alpha': 1e-05, 'l1_ratio': 0.0, 'max_iter': ... | -0.010726 | ... | -0.003562 | -0.003286 | -0.003884 | -0.006108 | -0.010456 | -0.017418 | -0.023895 | -0.031797 | -0.010722 | 0.009718 |
| 1 | 0.088894 | 0.018066 | 0.001396 | 0.001424 | 1e-05 | 0 | 10000 | 0.001 | {'alpha': 1e-05, 'l1_ratio': 0.0, 'max_iter': ... | -0.010726 | ... | -0.003562 | -0.003286 | -0.003884 | -0.006108 | -0.010456 | -0.017418 | -0.023895 | -0.031797 | -0.010722 | 0.009718 |
| 2 | 0.084987 | 0.022400 | 0.004686 | 0.007158 | 1e-05 | 0 | 10000 | 0.01 | {'alpha': 1e-05, 'l1_ratio': 0.0, 'max_iter': ... | -0.010726 | ... | -0.003562 | -0.003286 | -0.003884 | -0.006108 | -0.010456 | -0.017418 | -0.023895 | -0.031797 | -0.010722 | 0.009718 |
3 rows × 34 columns
optimised_en = clf_en.best_estimator_
joblib.dump(optimised_en, 'en_lag3_escalado.pkl')
['en_lag3_escalado.pkl']
optimised_en_lag3 = joblib.load('en_lag3_escalado.pkl')
y_pred = optimised_en_lag3.predict(X_test_3)
RMSEs_test_lag3_en = []
MAEs_test_lag3_en = []
R2s_test_lag3_en = []
RMSEs_test_lag3_en.append(RMSE(y_pred, y_test_3))
MAEs_test_lag3_en.append(MAE(y_pred, y_test_3))
R2s_test_lag3_en.append(R2(y_pred, y_test_3))
RMSE = 0.13 Mean absolute error = 0.09 R2 = 0.79
grafica(y_pred, y_test_3, -1)
np.corrcoef(y_test_3, y_pred)
array([[1. , 0.90394023],
[0.90394023, 1. ]])
predicciones_en_lag3 = calcular_predicciones_lag3(optimised_en_lag3, y_pred, X_test_3)
visualizar_predicciones(y_test_3, predicciones_en_lag3)
MAE_en, RMSE_en, R2_en = graficas_errores(predicciones_en_lag3, y_test_3)
Mean absolute error = 0.1 RMSE = 0.16 R2 = 0.69
Mean absolute error = 0.12 RMSE = 0.19 R2 = 0.58
Mean absolute error = 0.13 RMSE = 0.2 R2 = 0.53
Mean absolute error = 0.14 RMSE = 0.22 R2 = 0.43
Mean absolute error = 0.16 RMSE = 0.24 R2 = 0.33
Mean absolute error = 0.17 RMSE = 0.25 R2 = 0.27
Mean absolute error = 0.17 RMSE = 0.26 R2 = 0.19
# Add the multi-step errors after the single-step error already stored in each list.
RMSEs_test_lag3_en.extend(RMSE_en)
MAEs_test_lag3_en.extend(MAE_en)
R2s_test_lag3_en.extend(R2_en)
clf_en = search_grid(elastic_net, grid, X_train_5, y_train_5)
clf_en.best_params_
{'alpha': 1e-05, 'l1_ratio': 0.13, 'max_iter': 10000, 'tol': 0.0001}
en_results = pd.DataFrame(clf_en.cv_results_)
en_results.head(3)
| mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_alpha | param_l1_ratio | param_max_iter | param_tol | params | split0_test_score | ... | split2_train_score | split3_train_score | split4_train_score | split5_train_score | split6_train_score | split7_train_score | split8_train_score | split9_train_score | mean_train_score | std_train_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.136534 | 0.031766 | 0.002992 | 2.185142e-07 | 1e-05 | 0 | 10000 | 0.0001 | {'alpha': 1e-05, 'l1_ratio': 0.0, 'max_iter': ... | -0.013791 | ... | -0.003027 | -0.002863 | -0.003289 | -0.005275 | -0.008989 | -0.014612 | -0.021377 | -0.029597 | -0.009384 | 0.009027 |
| 1 | 0.144114 | 0.020392 | 0.003192 | 3.989941e-04 | 1e-05 | 0 | 10000 | 0.001 | {'alpha': 1e-05, 'l1_ratio': 0.0, 'max_iter': ... | -0.013791 | ... | -0.003027 | -0.002863 | -0.003289 | -0.005275 | -0.008989 | -0.014612 | -0.021377 | -0.029597 | -0.009384 | 0.009027 |
| 2 | 0.135238 | 0.031426 | 0.003591 | 1.197243e-03 | 1e-05 | 0 | 10000 | 0.01 | {'alpha': 1e-05, 'l1_ratio': 0.0, 'max_iter': ... | -0.013791 | ... | -0.003027 | -0.002863 | -0.003289 | -0.005275 | -0.008989 | -0.014612 | -0.021377 | -0.029597 | -0.009384 | 0.009027 |
3 rows × 34 columns
optimised_en = clf_en.best_estimator_
joblib.dump(optimised_en, 'en_lag5_escalado.pkl')
['en_lag5_escalado.pkl']
optimised_en_lag5 = joblib.load('en_lag5_escalado.pkl')
y_pred = optimised_en_lag5.predict(X_test_5)
RMSEs_test_lag5_en = []
MAEs_test_lag5_en = []
R2s_test_lag5_en = []
RMSEs_test_lag5_en.append(RMSE(y_pred, y_test_5))
MAEs_test_lag5_en.append(MAE(y_pred, y_test_5))
R2s_test_lag5_en.append(R2(y_pred, y_test_5))
RMSE = 0.13 Mean absolute error = 0.08 R2 = 0.8
grafica(y_pred, y_test_5, -1)
np.corrcoef(y_test_5, y_pred)
array([[1. , 0.90821593],
[0.90821593, 1. ]])
predicciones_en_lag5 = calcular_predicciones_lag5(optimised_en_lag5, y_pred, X_test_5)
visualizar_predicciones(y_test_5, predicciones_en_lag5)
MAE_en, RMSE_en, R2_en = graficas_errores(predicciones_en_lag5, y_test_5)
Mean absolute error = 0.09 RMSE = 0.14 R2 = 0.77
Mean absolute error = 0.1 RMSE = 0.15 R2 = 0.71
Mean absolute error = 0.11 RMSE = 0.17 R2 = 0.64
Mean absolute error = 0.14 RMSE = 0.21 R2 = 0.48
Mean absolute error = 0.15 RMSE = 0.22 R2 = 0.42
Mean absolute error = 0.17 RMSE = 0.24 R2 = 0.3
Mean absolute error = 0.19 RMSE = 0.27 R2 = 0.16
# Add the multi-step errors after the single-step error already stored in each list.
RMSEs_test_lag5_en.extend(RMSE_en)
MAEs_test_lag5_en.extend(MAE_en)
R2s_test_lag5_en.extend(R2_en)
Para cada uno de los modelos hemos obtenido los mejores hiperparámetros. Ahora volveremos a entrenar los modelos usando validación cruzada de 10 pliegues (time series) y obteniendo para cada pliegue los errores RMSE, MAE y R^2
cv = TimeSeriesSplit(n_splits = 10)
# Metrics requested from cross_validate for every fold. The MAE entry is the *mean*
# absolute error, the same metric named in the text and reported for the test set;
# the previous 'neg_median_absolute_error' measured a different quantity.
scorings = ['r2', 'neg_root_mean_squared_error', 'neg_mean_absolute_error']


def obtener_r2(scores):
    """Return the per-fold R^2 values stored under 'test_r2' in ``scores``."""
    return scores['test_r2']


def obtener_rmse(scores):
    """Return the per-fold RMSE values from ``scores``.

    The stored scores are negated errors, so the sign is flipped back.
    """
    return (-1) * scores['test_neg_root_mean_squared_error']


def obtener_mae(scores):
    """Return the per-fold mean absolute error values from ``scores``.

    The stored scores are negated errors, so the sign is flipped back.
    """
    return (-1) * scores['test_neg_mean_absolute_error']
Listas para añadir los errores de los splits para cada modelo en cada lag
# One accumulator per metric; each receives one array of per-split scores
# for every model/lag combination.
RMSEs_train, R2s_train, MAEs_train = [], [], []
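Una vez obtenidos todos los errores para cada uno de los modelos, creamos 6 dataframes, uno por cada métrica (RMSE, MAE y R^2) y conjunto: tres para entrenamiento, cuyas filas son los resultados de cada uno de los 10 pliegues, y tres para test, cuyas filas son los horizontes de predicción (t, t+1, ..., t+7). En todos ellos las columnas son las combinaciones de modelo y lag (lag3 y lag5).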
# Column labels: one per model/lag combination, in the order in which the
# per-model scores are appended to the metric lists.
columnas = ['RF_lag3', 'RF_lag5', 'LR_lag3', 'LR_lag5', 'Lasso_lag3', 'Lasso_lag5', 'SVR_lag3', 'SVR_lag5', 'GBR_lag3', 'GBR_lag5', 'EN_lag3', 'EN_lag5' ]
# Row labels "Fold 1" .. "Fold 10", one per cross-validation split.
filas_train = ['Fold ' + str(numero) for numero in range(1, 11)]
def crear_dataframe_error_train(columnas, filas, errores):
    """Build the per-fold error table for the cross-validation results.

    Parameters
    ----------
    columnas : column labels (one per model/lag combination).
    filas : row labels (one per fold).
    errores : sequence holding, for each column and in the same order, the
        values for every row.

    Returns
    -------
    pandas.DataFrame indexed by ``filas`` (index named 'Pliegues') with one
    column per entry of ``columnas``.

    Raises
    ------
    IndexError if ``errores`` has fewer entries than there are columns.
    """
    # Float placeholder: the scores are floats, so starting from an integer
    # frame would force a dtype change on assignment.
    df = pd.DataFrame(0.0, columns = columnas, index = filas)
    df.index.name = 'Pliegues'
    # Walk the actual columns instead of assuming there are exactly 12.
    for posicion, columna in enumerate(df.columns):
        df[columna] = errores[posicion]
    return df
# Reload the tuned RF estimators saved for the lag-3 and lag-5 feature sets.
optimised_rf_lag3 = joblib.load('rf_lag3_escalado.pkl')
optimised_rf_lag5 = joblib.load('rf_lag5_escalado.pkl')

# Cross-validate each one on its own training set with the shared splitter and scorers.
scores_RF_lag3 = cross_validate(optimised_rf_lag3, X_train_3, y_train_3, cv = cv, scoring = scorings)
scores_RF_lag5 = cross_validate(optimised_rf_lag5, X_train_5, y_train_5, cv = cv, scoring = scorings)

# Record the per-split metrics, lag 3 first and lag 5 second.
for _cv_scores in (scores_RF_lag3, scores_RF_lag5):
    R2s_train.append(obtener_r2(_cv_scores))
    RMSEs_train.append(obtener_rmse(_cv_scores))
    MAEs_train.append(obtener_mae(_cv_scores))
# Linear regression has no tuned hyperparameters here, so a fresh estimator is
# cross-validated on both the lag-3 and the lag-5 training sets.
regresion_lineal = LinearRegression()
scores_LR_lag3 = cross_validate(regresion_lineal, X_train_3, y_train_3, cv = cv, scoring = scorings)
scores_LR_lag5 = cross_validate(regresion_lineal, X_train_5, y_train_5, cv = cv, scoring = scorings)

# Record the per-split metrics, lag 3 first and lag 5 second.
for _cv_scores in (scores_LR_lag3, scores_LR_lag5):
    R2s_train.append(obtener_r2(_cv_scores))
    RMSEs_train.append(obtener_rmse(_cv_scores))
    MAEs_train.append(obtener_mae(_cv_scores))
# Reload the tuned Lasso estimators saved for the lag-3 and lag-5 feature sets.
optimised_lasso_lag3 = joblib.load('lasso_lag3_escalado.pkl')
optimised_lasso_lag5 = joblib.load('lasso_lag5_escalado.pkl')

# Cross-validate each one on its own training set with the shared splitter and scorers.
scores_lasso_lag3 = cross_validate(optimised_lasso_lag3, X_train_3, y_train_3, cv = cv, scoring = scorings)
scores_lasso_lag5 = cross_validate(optimised_lasso_lag5, X_train_5, y_train_5, cv = cv, scoring = scorings)

# Record the per-split metrics, lag 3 first and lag 5 second.
for _cv_scores in (scores_lasso_lag3, scores_lasso_lag5):
    R2s_train.append(obtener_r2(_cv_scores))
    RMSEs_train.append(obtener_rmse(_cv_scores))
    MAEs_train.append(obtener_mae(_cv_scores))
# Reload the tuned SVR estimators saved for the lag-3 and lag-5 feature sets.
optimised_svr_lag3 = joblib.load('svr_lag3_escalado.pkl')
optimised_svr_lag5 = joblib.load('svr_lag5_escalado.pkl')

# Cross-validate each one on its own training set with the shared splitter and scorers.
scores_svr_lag3 = cross_validate(optimised_svr_lag3, X_train_3, y_train_3, cv = cv, scoring = scorings)
scores_svr_lag5 = cross_validate(optimised_svr_lag5, X_train_5, y_train_5, cv = cv, scoring = scorings)

# Record the per-split metrics, lag 3 first and lag 5 second.
for _cv_scores in (scores_svr_lag3, scores_svr_lag5):
    R2s_train.append(obtener_r2(_cv_scores))
    RMSEs_train.append(obtener_rmse(_cv_scores))
    MAEs_train.append(obtener_mae(_cv_scores))
# Reload the tuned GBR estimators saved for the lag-3 and lag-5 feature sets.
optimised_gbr_lag3 = joblib.load('gbr_lag3_escalado.pkl')
optimised_gbr_lag5 = joblib.load('gbr_lag5_escalado.pkl')

# Cross-validate each one on its own training set with the shared splitter and scorers.
scores_gbr_lag3 = cross_validate(optimised_gbr_lag3, X_train_3, y_train_3, cv = cv, scoring = scorings)
scores_gbr_lag5 = cross_validate(optimised_gbr_lag5, X_train_5, y_train_5, cv = cv, scoring = scorings)

# Record the per-split metrics, lag 3 first and lag 5 second.
for _cv_scores in (scores_gbr_lag3, scores_gbr_lag5):
    R2s_train.append(obtener_r2(_cv_scores))
    RMSEs_train.append(obtener_rmse(_cv_scores))
    MAEs_train.append(obtener_mae(_cv_scores))
# Reload the tuned EN estimators saved for the lag-3 and lag-5 feature sets.
optimised_en_lag3 = joblib.load('en_lag3_escalado.pkl')
optimised_en_lag5 = joblib.load('en_lag5_escalado.pkl')

# Cross-validate each one on its own training set with the shared splitter and scorers.
scores_en_lag3 = cross_validate(optimised_en_lag3, X_train_3, y_train_3, cv = cv, scoring = scorings)
scores_en_lag5 = cross_validate(optimised_en_lag5, X_train_5, y_train_5, cv = cv, scoring = scorings)

# Record the per-split metrics, lag 3 first and lag 5 second.
for _cv_scores in (scores_en_lag3, scores_en_lag5):
    R2s_train.append(obtener_r2(_cv_scores))
    RMSEs_train.append(obtener_rmse(_cv_scores))
    MAEs_train.append(obtener_mae(_cv_scores))
Creamos los dataframes con los errores y los guardamos
# Per-fold RMSE table: one column per model/lag, one row per fold.
RMSE_errores_train = crear_dataframe_error_train(columnas, filas_train, RMSEs_train)
# Bare expression so the notebook renders the table.
RMSE_errores_train
| RF_lag3 | RF_lag5 | LR_lag3 | LR_lag5 | Lasso_lag3 | Lasso_lag5 | SVR_lag3 | SVR_lag5 | GBR_lag3 | GBR_lag5 | EN_lag3 | EN_lag5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Pliegues | ||||||||||||
| Fold 1 | 0.010250 | 0.012253 | 0.040040 | 0.021895 | 0.010777 | 0.012175 | 0.015468 | 0.017608 | 0.009779 | 0.011421 | 0.009565 | 0.011302 |
| Fold 2 | 0.003465 | 0.005035 | 0.006011 | 0.008859 | 0.005831 | 0.004163 | 0.006113 | 0.004916 | 0.003236 | 0.005702 | 0.005437 | 0.004665 |
| Fold 3 | 0.002359 | 0.002950 | 0.002599 | 0.004666 | 0.002282 | 0.002253 | 0.007964 | 0.006088 | 0.002058 | 0.001663 | 0.002220 | 0.002200 |
| Fold 4 | 0.005817 | 0.005213 | 0.008027 | 0.008158 | 0.006893 | 0.007117 | 0.012659 | 0.018517 | 0.005418 | 0.005386 | 0.006962 | 0.007347 |
| Fold 5 | 0.019228 | 0.018612 | 0.015113 | 0.017661 | 0.014720 | 0.014294 | 0.018877 | 0.022049 | 0.018109 | 0.019953 | 0.015303 | 0.015526 |
| Fold 6 | 0.097717 | 0.099564 | 0.030118 | 0.027917 | 0.031447 | 0.029454 | 0.031626 | 0.026445 | 0.094637 | 0.099538 | 0.031492 | 0.028937 |
| Fold 7 | 0.054043 | 0.054168 | 0.047621 | 0.046455 | 0.048174 | 0.044944 | 0.050317 | 0.046399 | 0.049422 | 0.054665 | 0.048253 | 0.045141 |
| Fold 8 | 0.117732 | 0.113723 | 0.064359 | 0.061369 | 0.063967 | 0.061773 | 0.066249 | 0.059950 | 0.118328 | 0.118405 | 0.064465 | 0.062454 |
| Fold 9 | 0.124888 | 0.141807 | 0.085793 | 0.091884 | 0.085716 | 0.091056 | 0.089009 | 0.093796 | 0.121939 | 0.142605 | 0.085260 | 0.091053 |
| Fold 10 | 0.060135 | 0.064386 | 0.029916 | 0.038327 | 0.029938 | 0.036199 | 0.029868 | 0.036866 | 0.048429 | 0.046525 | 0.029645 | 0.035670 |
# Persist the per-fold RMSE table and reload it from disk.
# The encoding is stated explicitly on both sides so the file is always read
# back with the same encoding it was written with.
RMSE_errores_train.to_csv(path_or_buf = '../Datos_preprocesados/RMSE_errores_train.csv', encoding = 'utf-8')
RMSE_errores_train = read_csv('../Datos_preprocesados/RMSE_errores_train.csv', encoding = 'utf-8', sep = ',', na_values = ['NaN', 'NaT'])
# Restore the fold labels as the index.
RMSE_errores_train = RMSE_errores_train.set_index('Pliegues')
# Per-fold MAE table: one column per model/lag, one row per fold.
MAE_errores_train = crear_dataframe_error_train(columnas, filas_train, MAEs_train)
# Bare expression so the notebook renders the table.
MAE_errores_train
| RF_lag3 | RF_lag5 | LR_lag3 | LR_lag5 | Lasso_lag3 | Lasso_lag5 | SVR_lag3 | SVR_lag5 | GBR_lag3 | GBR_lag5 | EN_lag3 | EN_lag5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Pliegues | ||||||||||||
| Fold 1 | 0.008462 | 0.011392 | 0.032479 | 0.014992 | 0.007227 | 0.007082 | 0.012623 | 0.017494 | 0.007774 | 0.009879 | 0.006476 | 0.005981 |
| Fold 2 | 0.003400 | 0.005021 | 0.004186 | 0.003734 | 0.004024 | 0.002734 | 0.005366 | 0.003984 | 0.003021 | 0.005115 | 0.003658 | 0.003443 |
| Fold 3 | 0.001714 | 0.002173 | 0.001866 | 0.003266 | 0.001329 | 0.001685 | 0.005990 | 0.004277 | 0.001440 | 0.001243 | 0.001306 | 0.001632 |
| Fold 4 | 0.002247 | 0.002934 | 0.003994 | 0.002763 | 0.002256 | 0.002159 | 0.007597 | 0.010134 | 0.001828 | 0.001652 | 0.001959 | 0.002001 |
| Fold 5 | 0.012167 | 0.013229 | 0.011721 | 0.007150 | 0.011192 | 0.012083 | 0.014557 | 0.018390 | 0.012484 | 0.013561 | 0.012501 | 0.012333 |
| Fold 6 | 0.084448 | 0.087158 | 0.016943 | 0.016096 | 0.017917 | 0.016959 | 0.021310 | 0.012787 | 0.082231 | 0.087985 | 0.020494 | 0.016918 |
| Fold 7 | 0.033824 | 0.033456 | 0.026278 | 0.026127 | 0.025823 | 0.027261 | 0.034723 | 0.028557 | 0.022534 | 0.032382 | 0.026455 | 0.025546 |
| Fold 8 | 0.065309 | 0.056947 | 0.036142 | 0.032730 | 0.035894 | 0.033170 | 0.040139 | 0.031767 | 0.074404 | 0.061051 | 0.037323 | 0.032217 |
| Fold 9 | 0.098532 | 0.118379 | 0.052798 | 0.042889 | 0.055291 | 0.043395 | 0.059637 | 0.043899 | 0.099885 | 0.115925 | 0.054401 | 0.042336 |
| Fold 10 | 0.043867 | 0.055397 | 0.020134 | 0.024806 | 0.020085 | 0.023185 | 0.018272 | 0.020647 | 0.030652 | 0.040988 | 0.019380 | 0.022019 |
# Persist the per-fold MAE table and reload it from disk.
# The encoding is stated explicitly on both sides so the file is always read
# back with the same encoding it was written with.
MAE_errores_train.to_csv(path_or_buf = '../Datos_preprocesados/MAE_errores_train.csv', encoding = 'utf-8')
MAE_errores_train = read_csv('../Datos_preprocesados/MAE_errores_train.csv', encoding = 'utf-8', sep = ',', na_values = ['NaN', 'NaT'])
# Restore the fold labels as the index.
MAE_errores_train = MAE_errores_train.set_index('Pliegues')
# Per-fold R^2 table: one column per model/lag, one row per fold.
R2_errores_train = crear_dataframe_error_train(columnas, filas_train, R2s_train)
# Bare expression so the notebook renders the table.
R2_errores_train
| RF_lag3 | RF_lag5 | LR_lag3 | LR_lag5 | Lasso_lag3 | Lasso_lag5 | SVR_lag3 | SVR_lag5 | GBR_lag3 | GBR_lag5 | EN_lag3 | EN_lag5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Pliegues | ||||||||||||
| Fold 1 | -29.694342 | -42.863114 | -467.380863 | -139.060110 | -32.932006 | -42.304050 | -68.902018 | -89.585501 | -26.941164 | -37.108063 | -25.730847 | -36.321997 |
| Fold 2 | -12.057997 | -26.565429 | -38.290436 | -84.342924 | -35.962679 | -17.841906 | -39.632676 | -25.276891 | -10.383378 | -34.350915 | -31.139927 | -22.665800 |
| Fold 3 | -2.497690 | -4.467829 | -3.245419 | -12.683312 | -2.272921 | -2.189557 | -38.856047 | -22.291884 | -1.661564 | -0.737697 | -2.098041 | -2.041702 |
| Fold 4 | 0.208589 | 0.364302 | -0.507292 | -0.556680 | -0.111366 | -0.184671 | -2.748629 | -7.020625 | 0.313296 | 0.321470 | -0.133801 | -0.262629 |
| Fold 5 | -0.550522 | -0.452815 | 0.042057 | -0.308119 | 0.091258 | 0.143046 | -0.494423 | -1.038993 | -0.375323 | -0.669630 | 0.017847 | -0.010932 |
| Fold 6 | -3.250044 | -3.412252 | 0.596246 | 0.653112 | 0.559831 | 0.613866 | 0.554808 | 0.688739 | -2.986368 | -3.409879 | 0.558570 | 0.627305 |
| Fold 7 | -0.253955 | -0.259751 | 0.026343 | 0.073442 | 0.003598 | 0.132760 | -0.086999 | 0.075686 | -0.048672 | -0.283003 | 0.000338 | 0.125114 |
| Fold 8 | -0.479284 | -0.380258 | 0.557943 | 0.598060 | 0.563308 | 0.592751 | 0.531592 | 0.616436 | -0.494295 | -0.496241 | 0.556486 | 0.583722 |
| Fold 9 | 0.035836 | -0.243110 | 0.544993 | 0.478093 | 0.545815 | 0.487454 | 0.510247 | 0.456143 | 0.080832 | -0.257130 | 0.550628 | 0.487489 |
| Fold 10 | -2.836451 | -3.398041 | 0.050516 | -0.558438 | 0.049111 | -0.390182 | 0.053565 | -0.441872 | -1.488205 | -1.296420 | 0.067631 | -0.349833 |
# Persist the per-fold R^2 table and reload it from disk.
# The encoding is stated explicitly on both sides so the file is always read
# back with the same encoding it was written with.
R2_errores_train.to_csv(path_or_buf = '../Datos_preprocesados/R2_errores_train.csv', encoding = 'utf-8')
R2_errores_train = read_csv('../Datos_preprocesados/R2_errores_train.csv', encoding = 'utf-8', sep = ',', na_values = ['NaN', 'NaT'])
# Restore the fold labels as the index.
R2_errores_train = R2_errores_train.set_index('Pliegues')
# Column labels: one per model/lag combination.
columnas = ['RF_lag3', 'RF_lag5', 'LR_lag3', 'LR_lag5', 'Lasso_lag3', 'Lasso_lag5', 'SVR_lag3', 'SVR_lag5', 'GBR_lag3', 'GBR_lag5', 'EN_lag3', 'EN_lag5' ]
# Row labels for the eight prediction horizons: "t" first, then "t+1" .. "t+7".
filas_test = ['Predicción t'] + ['Predicción t+' + str(paso) for paso in range(1, 8)]
def crear_dataframe_error_test(columnas, filas, errores):
    """Build the per-horizon error table for the test-set predictions.

    Parameters
    ----------
    columnas : column labels (one per model/lag combination).
    filas : row labels (one per prediction horizon).
    errores : sequence holding, for each column and in the same order, the
        values for every row.

    Returns
    -------
    pandas.DataFrame indexed by ``filas`` (index named 'Predicciones') with
    one column per entry of ``columnas``.

    Raises
    ------
    IndexError if ``errores`` has fewer entries than there are columns.
    """
    df = pd.DataFrame(0, columns = columnas, index = filas)
    df.index.name = 'Predicciones'
    # enumerate pairs every column with the position of its values in `errores`.
    for posicion, columna in enumerate(df.columns):
        df[columna] = errores[posicion]
    return df
# Per-horizon RMSE lists of every model/lag, in the same order as `columnas`.
RMSEs_test = [
    RMSEs_test_lag3_rf,
    RMSEs_test_lag5_rf,
    RMSEs_test_lag3_LR,
    RMSEs_test_lag5_LR,
    RMSEs_test_lag3_lasso,
    RMSEs_test_lag5_lasso,
    RMSEs_test_lag3_SVR,
    RMSEs_test_lag5_SVR,
    RMSEs_test_lag3_gbr,
    RMSEs_test_lag5_gbr,
    RMSEs_test_lag3_en,
    RMSEs_test_lag5_en,
]
RMSE_errores_test = crear_dataframe_error_test(columnas, filas_test, RMSEs_test)
# Bare expression so the notebook renders the table.
RMSE_errores_test
| RF_lag3 | RF_lag5 | LR_lag3 | LR_lag5 | Lasso_lag3 | Lasso_lag5 | SVR_lag3 | SVR_lag5 | GBR_lag3 | GBR_lag5 | EN_lag3 | EN_lag5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Predicciones | ||||||||||||
| Predicción t | 0.22 | 0.23 | 0.13 | 0.13 | 0.13 | 0.13 | 0.13 | 0.13 | 0.21 | 0.21 | 0.13 | 0.13 |
| Predicción t+1 | 0.23 | 0.23 | 0.18 | 0.16 | 0.16 | 0.14 | 0.16 | 0.14 | 0.22 | 0.21 | 0.16 | 0.14 |
| Predicción t+2 | 0.23 | 0.23 | 0.21 | 0.17 | 0.20 | 0.16 | 0.19 | 0.15 | 0.22 | 0.22 | 0.19 | 0.15 |
| Predicción t+3 | 0.23 | 0.23 | 0.23 | 0.19 | 0.21 | 0.18 | 0.20 | 0.17 | 0.23 | 0.22 | 0.20 | 0.17 |
| Predicción t+4 | 0.23 | 0.23 | 0.27 | 0.25 | 0.23 | 0.22 | 0.23 | 0.21 | 0.23 | 0.22 | 0.22 | 0.21 |
| Predicción t+5 | 0.23 | 0.24 | 0.30 | 0.25 | 0.26 | 0.23 | 0.26 | 0.21 | 0.23 | 0.22 | 0.24 | 0.22 |
| Predicción t+6 | 0.24 | 0.24 | 0.32 | 0.30 | 0.27 | 0.25 | 0.27 | 0.24 | 0.23 | 0.22 | 0.25 | 0.24 |
| Predicción t+7 | 0.24 | 0.24 | 0.35 | 0.33 | 0.28 | 0.28 | 0.30 | 0.27 | 0.23 | 0.22 | 0.26 | 0.27 |
# Persist the per-horizon RMSE table and reload it from disk.
# The index labels contain non-ASCII text ("Predicción"); to_csv writes UTF-8,
# so the file must be read back as UTF-8 as well. Reading it as latin-1 turns
# the labels into mojibake.
RMSE_errores_test.to_csv(path_or_buf = '../Datos_preprocesados/RMSE_errores_test.csv', encoding = 'utf-8')
RMSE_errores_test = read_csv('../Datos_preprocesados/RMSE_errores_test.csv', encoding = 'utf-8', sep = ',', na_values = ['NaN', 'NaT'])
# Restore the horizon labels as the index.
RMSE_errores_test = RMSE_errores_test.set_index('Predicciones')
# Per-horizon MAE lists of every model/lag, in the same order as `columnas`.
MAEs_test = [
    MAEs_test_lag3_rf,
    MAEs_test_lag5_rf,
    MAEs_test_lag3_LR,
    MAEs_test_lag5_LR,
    MAEs_test_lag3_lasso,
    MAEs_test_lag5_lasso,
    MAEs_test_lag3_SVR,
    MAEs_test_lag5_SVR,
    MAEs_test_lag3_gbr,
    MAEs_test_lag5_gbr,
    MAEs_test_lag3_en,
    MAEs_test_lag5_en,
]
MAE_errores_test = crear_dataframe_error_test(columnas, filas_test, MAEs_test)
# Bare expression so the notebook renders the table.
MAE_errores_test
| RF_lag3 | RF_lag5 | LR_lag3 | LR_lag5 | Lasso_lag3 | Lasso_lag5 | SVR_lag3 | SVR_lag5 | GBR_lag3 | GBR_lag5 | EN_lag3 | EN_lag5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Predicciones | ||||||||||||
| Predicción t | 0.14 | 0.14 | 0.09 | 0.08 | 0.09 | 0.08 | 0.09 | 0.08 | 0.13 | 0.13 | 0.09 | 0.08 |
| Predicción t+1 | 0.15 | 0.15 | 0.11 | 0.11 | 0.10 | 0.09 | 0.10 | 0.09 | 0.14 | 0.14 | 0.10 | 0.09 |
| Predicción t+2 | 0.15 | 0.15 | 0.14 | 0.12 | 0.13 | 0.11 | 0.12 | 0.10 | 0.14 | 0.14 | 0.12 | 0.10 |
| Predicción t+3 | 0.15 | 0.15 | 0.15 | 0.13 | 0.13 | 0.12 | 0.13 | 0.11 | 0.14 | 0.14 | 0.13 | 0.11 |
| Predicción t+4 | 0.15 | 0.15 | 0.17 | 0.17 | 0.15 | 0.15 | 0.15 | 0.14 | 0.14 | 0.14 | 0.14 | 0.14 |
| Predicción t+5 | 0.15 | 0.15 | 0.19 | 0.17 | 0.17 | 0.16 | 0.17 | 0.14 | 0.15 | 0.14 | 0.16 | 0.15 |
| Predicción t+6 | 0.15 | 0.16 | 0.21 | 0.21 | 0.17 | 0.17 | 0.18 | 0.16 | 0.15 | 0.14 | 0.17 | 0.17 |
| Predicción t+7 | 0.16 | 0.16 | 0.23 | 0.23 | 0.19 | 0.19 | 0.20 | 0.19 | 0.15 | 0.14 | 0.17 | 0.19 |
# Persist the per-horizon MAE table and reload it from disk.
# The index labels contain non-ASCII text ("Predicción"); to_csv writes UTF-8,
# so the file must be read back as UTF-8 as well. Reading it as latin-1 turns
# the labels into mojibake.
MAE_errores_test.to_csv(path_or_buf = '../Datos_preprocesados/MAE_errores_test.csv', encoding = 'utf-8')
MAE_errores_test = read_csv('../Datos_preprocesados/MAE_errores_test.csv', encoding = 'utf-8', sep = ',', na_values = ['NaN', 'NaT'])
# Restore the horizon labels as the index.
MAE_errores_test = MAE_errores_test.set_index('Predicciones')
# Per-horizon R^2 lists of every model/lag, in the same order as `columnas`.
R2s_test = [
    R2s_test_lag3_rf,
    R2s_test_lag5_rf,
    R2s_test_lag3_LR,
    R2s_test_lag5_LR,
    R2s_test_lag3_lasso,
    R2s_test_lag5_lasso,
    R2s_test_lag3_SVR,
    R2s_test_lag5_SVR,
    R2s_test_lag3_gbr,
    R2s_test_lag5_gbr,
    R2s_test_lag3_en,
    R2s_test_lag5_en,
]
R2_errores_test = crear_dataframe_error_test(columnas, filas_test, R2s_test)
# Bare expression so the notebook renders the table.
R2_errores_test
| RF_lag3 | RF_lag5 | LR_lag3 | LR_lag5 | Lasso_lag3 | Lasso_lag5 | SVR_lag3 | SVR_lag5 | GBR_lag3 | GBR_lag5 | EN_lag3 | EN_lag5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Predicciones | ||||||||||||
| Predicción t | 0.40 | 0.37 | 0.78 | 0.80 | 0.79 | 0.80 | 0.80 | 0.80 | 0.45 | 0.46 | 0.79 | 0.80 |
| Predicción t+1 | 0.37 | 0.35 | 0.61 | 0.69 | 0.67 | 0.75 | 0.69 | 0.77 | 0.39 | 0.44 | 0.69 | 0.77 |
| Predicción t+2 | 0.36 | 0.35 | 0.46 | 0.65 | 0.54 | 0.69 | 0.58 | 0.73 | 0.40 | 0.44 | 0.58 | 0.71 |
| Predicción t+3 | 0.36 | 0.35 | 0.39 | 0.56 | 0.49 | 0.61 | 0.53 | 0.67 | 0.39 | 0.44 | 0.53 | 0.64 |
| Predicción t+4 | 0.36 | 0.35 | 0.13 | 0.28 | 0.36 | 0.44 | 0.38 | 0.50 | 0.39 | 0.44 | 0.43 | 0.48 |
| Predicción t+5 | 0.35 | 0.34 | -0.03 | 0.24 | 0.23 | 0.38 | 0.23 | 0.47 | 0.38 | 0.44 | 0.33 | 0.42 |
| Predicción t+6 | 0.34 | 0.33 | -0.17 | -0.09 | 0.16 | 0.24 | 0.12 | 0.32 | 0.38 | 0.43 | 0.27 | 0.30 |
| Predicción t+7 | 0.34 | 0.33 | -0.41 | -0.28 | 0.05 | 0.09 | -0.04 | 0.13 | 0.37 | 0.42 | 0.19 | 0.16 |
# Persist the per-horizon R^2 table and reload it from disk.
# The index labels contain non-ASCII text ("Predicción"); to_csv writes UTF-8,
# so the file must be read back as UTF-8 as well. Reading it as latin-1 turns
# the labels into mojibake.
R2_errores_test.to_csv(path_or_buf = '../Datos_preprocesados/R2_errores_test.csv', encoding = 'utf-8')
R2_errores_test = read_csv('../Datos_preprocesados/R2_errores_test.csv', encoding = 'utf-8', sep = ',', na_values = ['NaN', 'NaT'])
# Restore the horizon labels as the index.
R2_errores_test = R2_errores_test.set_index('Predicciones')